diff --git a/CHOLMOD/Doc/CHOLMOD_UserGuide.pdf b/CHOLMOD/Doc/CHOLMOD_UserGuide.pdf index d465856359..73b9f70566 100644 Binary files a/CHOLMOD/Doc/CHOLMOD_UserGuide.pdf and b/CHOLMOD/Doc/CHOLMOD_UserGuide.pdf differ diff --git a/ChangeLog b/ChangeLog index 580fff8dda..74cc008d39 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,7 @@ +Mar 3, 2021, SuiteSparse 5.9.0 + + * GraphBLAS v4.0.3: many new features, much faster performance + July 14, 2020, SuiteSparse 5.8.1 * SLIP_LU v1.0.2: resolved issue #51 diff --git a/GraphBLAS/.gitignore b/GraphBLAS/.gitignore index 5a4ebfdb17..5b52c7158d 100644 --- a/GraphBLAS/.gitignore +++ b/GraphBLAS/.gitignore @@ -36,6 +36,7 @@ my_*.out .nfs* .pyc *.tmp +gunk* Demo/bfs_demo.out Demo/mis_demo.out @@ -60,6 +61,7 @@ alternative/*_demo Test/*.log Test/errlog.txt Test/errlog2.txt +Test/errlog3.txt Test/log.txt Test/gunk* Test/fprint.txt @@ -69,6 +71,7 @@ Doc/GraphBLAS_UserGuide.out Tcov/errlog.txt Tcov/errlog2.txt +Tcov/errlog3.txt Tcov/log.txt Tcov/grbstat.mat Tcov/fprint.txt diff --git a/GraphBLAS/CMakeLists.txt b/GraphBLAS/CMakeLists.txt index f74d3efaa9..c5ea314a8c 100644 --- a/GraphBLAS/CMakeLists.txt +++ b/GraphBLAS/CMakeLists.txt @@ -2,8 +2,8 @@ # GraphBLAS/CMakeLists.txt: cmake script for GraphBLAS #------------------------------------------------------------------------------- -# SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2020, All Rights Reserved. -# http://suitesparse.com See GraphBLAS/Doc/License.txt for license. +# SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2021, All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 # CMakeLists.txt: instructions for cmake to build GraphBLAS. # An ANSI C11 compiler is required. @@ -59,9 +59,9 @@ endif ( ) set ( CMAKE_MACOSX_RPATH TRUE ) # version of SuiteSparse:GraphBLAS -set ( GraphBLAS_DATE "July 14, 2020" ) -set ( GraphBLAS_VERSION_MAJOR 3 ) -set ( GraphBLAS_VERSION_MINOR 3 ) +set ( GraphBLAS_DATE "Jan 19, 2021") +set ( GraphBLAS_VERSION_MAJOR 4 ) +set ( GraphBLAS_VERSION_MINOR 0 ) set ( GraphBLAS_VERSION_SUB 3 ) # GraphBLAS C API Specification version, at graphblas.org @@ -101,12 +101,14 @@ else ( ) message ( STATUS "Building dynamic GraphBLAS library only" ) endif ( ) -# select "true" to enable burble, for GraphBLAS development only -# set ( GB_BURBLE true ) - set ( GB_BURBLE false ) +# select "false" to disable the burble. It is now enabled by default. 
+ set ( GB_BURBLE true ) +# set ( GB_BURBLE false ) if ( GB_BURBLE ) set ( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DGB_BURBLE=1 " ) +else ( ) + set ( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DGB_BURBLE=0 " ) endif ( ) if ( GBCOMPACT ) @@ -159,7 +161,7 @@ if ( CMAKE_CUDA ) message ( STATUS "CUDA enabled" ) set ( CMAKE_CUDA_FLAG " -DGBCUDA" ) set ( GB_CUDA graphblascuda cuda cudadevrt cudart nvrtc ) - link_directories ( "CUDA" "/usr/local/cuda/lib64" ) + link_directories ( "CUDA" "/usr/local/cuda/lib64" "/usr/local/cuda/lib64/stubs" ) else ( ) message ( STATUS "CUDA not enabled" ) set ( CMAKE_CUDA_FLAG " " ) @@ -180,8 +182,6 @@ endif ( ) message ( STATUS "CMAKE compiler ID: " ${CMAKE_C_COMPILER_ID} ) message ( STATUS "CMAKE thread library: " ${CMAKE_THREAD_LIBS_INIT} ) -message ( STATUS "CMAKE have pthreads: " ${CMAKE_USE_PTHREADS_INIT} ) -message ( STATUS "CMAKE have Win32 pthreads: " ${CMAKE_USE_WIN32_THREADS_INIT} ) message ( STATUS "CMAKE have OpenMP: " ${OPENMP_FOUND} ) #------------------------------------------------------------------------------- @@ -238,6 +238,9 @@ include_directories ( Source/Template Source Source/Generated Source/Generator I # check which compiler is being used. If you need to make # compiler-specific modifications, here is the place to do it. if ( "${CMAKE_C_COMPILER_ID}" STREQUAL "GNU") + # The -g option is useful for the Intel VTune tool, but it should be + # removed in production. Comment this line out if not in use: + # set ( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g" ) # cmake 2.8 workaround: gcc needs to be told to do ANSI C11. # cmake 3.0 doesn't have this problem. set ( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c11 -lm -Wno-pragmas " ) @@ -251,7 +254,6 @@ if ( "${CMAKE_C_COMPILER_ID}" STREQUAL "GNU") set ( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fwrapv ") # check all warnings (uncomment for development only) # set ( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wextra -Wpedantic -Werror " ) - # set ( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g" ) if ( CMAKE_C_COMPILER_VERSION VERSION_LESS 4.9 ) message ( FATAL_ERROR "gcc version must be at least 4.9" ) endif ( ) @@ -274,6 +276,7 @@ elseif ( "${CMAKE_C_COMPILER_ID}" STREQUAL "Intel" ) endif ( ) elseif ( "${CMAKE_C_COMPILER_ID}" STREQUAL "Clang" ) # options for clang + set ( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-pointer-sign " ) if ( CMAKE_C_COMPILER_VERSION VERSION_LESS 3.3 ) message ( FATAL_ERROR "clang version must be at least 3.3" ) endif ( ) @@ -349,25 +352,10 @@ endif ( ) # select the threading library #------------------------------------------------------------------------------- -if ( USER_OPENMP ) - # user insists on OpenMP synchronization inside GraphBLAS - message ( STATUS "cmake -DUSER_OPENMP=1: insisting on using OpenMP" ) +if ( OPENMP_FOUND ) +# set this to 'false' if you do not want OpenMP +# set ( USE_OPENMP false ) set ( USE_OPENMP true ) -elseif ( USER_POSIX ) - # user insists on POSIX synchronization inside GraphBLAS - message ( STATUS "cmake -DUSER_POSIX=1: insisting on using POSIX" ) - set ( USE_POSIX true ) -elseif ( USER_NONE ) - message ( STATUS "cmake -DUSER_NONE=1: insisting on using no threading" ) - set ( USE_NONE true ) -else ( ) - # default: automatic selection - message ( STATUS "Automatic selection of synchronization method for user threads" ) - if ( OPENMP_FOUND ) - set ( USE_OPENMP true ) - elseif ( CMAKE_USE_PTHREADS_INIT ) - set ( USE_POSIX true ) - endif ( ) endif ( ) #------------------------------------------------------------------------------- @@ -380,67 +368,44 @@ else ( ) set ( M_LIB "m" ) endif ( ) 
+target_link_libraries ( graphblas ${M_LIB} ) +if ( BUILD_GRB_STATIC_LIBRARY ) + target_link_libraries ( graphblas_static ${M_LIB} ) +endif ( ) + #------------------------------------------------------------------------------- -# add the threading library +# add the OpenMP, CUDA, BLAS, ... libraries #------------------------------------------------------------------------------- if ( USE_OPENMP ) - # use OpenMP for user thread synchronization - message ( STATUS "Using OpenMP to synchronize user threads" ) - target_link_libraries ( graphblas ${M_LIB} ${OpenMP_C_LIBRARIES} ${GB_CUDA} ) - if ( BUILD_GRB_STATIC_LIBRARY ) - target_link_libraries ( graphblas_static ${M_LIB} ${OpenMP_C_LIBRARIES} ${GB_CUDA} ) - endif ( ) - set ( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS} -DUSER_OPENMP_THREADS " ) -elseif ( USE_POSIX ) - # use POSIX for user thread synchronization - message ( STATUS "Using POSIX pthreads to synchronize user threads" ) - target_link_libraries ( graphblas ${M_LIB} ${GB_CUDA} ) + target_link_libraries ( graphblas ${OpenMP_C_LIBRARIES} ) if ( BUILD_GRB_STATIC_LIBRARY ) - target_link_libraries ( graphblas_static ${M_LIB} ${GB_CUDA} ) + target_link_libraries ( graphblas_static ${OpenMP_C_LIBRARIES} ) endif ( ) - set ( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread -DUSER_POSIX_THREADS " ) -else ( ) - # use no threading at all - message ( WARNING "No support for user threads; GraphBLAS will not be thread-safe" ) - target_link_libraries ( graphblas ${M_LIB} ${GB_CUDA} ) - if ( BUILD_GRB_STATIC_LIBRARY ) - target_link_libraries ( graphblas_static ${M_LIB} ${GB_CUDA} ) - endif ( ) - set ( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DUSER_NO_THREADS " ) + set ( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS} " ) endif ( ) -if ( CMAKE_THREAD_LIBS_INIT ) - target_link_libraries ( graphblas ${CMAKE_THREAD_LIBS_INIT} ${GB_CUDA} ) +if ( CMAKE_CUDA ) + target_link_libraries ( graphblas ${GB_CUDA} ) if ( BUILD_GRB_STATIC_LIBRARY ) - target_link_libraries ( graphblas_static ${CMAKE_THREAD_LIBS_INIT} ${GB_CUDA} ) + target_link_libraries ( graphblas_static ${GB_CUDA} ) endif ( ) endif ( ) -if ( OPENMP_FOUND ) - # use OpenMP for internal parallelism - message ( STATUS "Using OpenMP for internal parallelism" ) - set ( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}" ) - target_link_libraries ( graphblas ${M_LIB} ${OpenMP_C_LIBRARIES} ${GB_CUDA} ) +if ( CMAKE_THREAD_LIBS_INIT ) + target_link_libraries ( graphblas ${CMAKE_THREAD_LIBS_INIT} ) if ( BUILD_GRB_STATIC_LIBRARY ) - target_link_libraries ( graphblas_static ${M_LIB} ${OpenMP_C_LIBRARIES} ${GB_CUDA} ) + target_link_libraries ( graphblas_static ${CMAKE_THREAD_LIBS_INIT} ) endif ( ) endif ( ) -if ( CMAKE_USE_PTHREADS_INIT ) - set ( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DHAVE_PTHREADS " ) -endif ( ) - -if ( CMAKE_USE_WIN32_THREADS_INIT ) - set ( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DHAVE_WINDOWS_THREADS " ) -endif ( ) - -if ( BLAS_FOUND ) - # use the dense CBLAS - message ( STATUS "Using dense CBLAS for faster dense matrix/vector operations" ) - set ( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DGB_HAS_CBLAS " ) - target_link_libraries ( graphblas ${BLAS_LIBRARIES} ) -endif ( ) +# FUTURE: +# if ( BLAS_FOUND ) +# # use the dense CBLAS +# message ( STATUS "Using dense CBLAS for faster dense matrix/vector operations" ) +# set ( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DGB_HAS_CBLAS " ) +# target_link_libraries ( graphblas ${BLAS_LIBRARIES} ) +# endif ( ) #------------------------------------------------------------------------------- # determine the default matrix format @@ -487,7 +452,6 
@@ endif ( ) add_executable ( pagerank_demo "Demo/Program/pagerank_demo.c" ) add_executable ( bfs_demo "Demo/Program/bfs_demo.c" ) add_executable ( tri_demo "Demo/Program/tri_demo.c" ) -add_executable ( pthread_demo "Demo/Program/pthread_demo.c" ) add_executable ( openmp_demo "Demo/Program/openmp_demo.c" ) add_executable ( mis_demo "Demo/Program/mis_demo.c" ) add_executable ( complex_demo "Demo/Program/complex_demo.c" ) @@ -501,7 +465,6 @@ add_executable ( import_demo "Demo/Program/import_demo.c" ) target_link_libraries ( pagerank_demo graphblas graphblasdemo ${GB_CUDA} ) target_link_libraries ( bfs_demo graphblas graphblasdemo ${GB_CUDA} ) target_link_libraries ( tri_demo graphblas graphblasdemo ${GB_CUDA} ) -target_link_libraries ( pthread_demo graphblas graphblasdemo ${GB_CUDA} ) target_link_libraries ( openmp_demo graphblas graphblasdemo ${GB_CUDA} ) target_link_libraries ( mis_demo graphblas graphblasdemo ${GB_CUDA} ) target_link_libraries ( complex_demo graphblas graphblasdemo ${GB_CUDA} ) diff --git a/GraphBLAS/Extras/tri/.gitignore b/GraphBLAS/CUDA/.gitignore similarity index 68% rename from GraphBLAS/Extras/tri/.gitignore rename to GraphBLAS/CUDA/.gitignore index 36ca52a96d..5db082e1af 100644 --- a/GraphBLAS/Extras/tri/.gitignore +++ b/GraphBLAS/CUDA/.gitignore @@ -1,6 +1,9 @@ # Ignore these files: *.o -tri_main +*.a +*.so +jitFactory +stringify # Do not ignore this file !.gitignore diff --git a/GraphBLAS/CUDA/GB_AxB_dot3_cuda.cu b/GraphBLAS/CUDA/GB_AxB_dot3_cuda.cu new file mode 100644 index 0000000000..fcb9294910 --- /dev/null +++ b/GraphBLAS/CUDA/GB_AxB_dot3_cuda.cu @@ -0,0 +1,693 @@ +//------------------------------------------------------------------------------ +// GB_AxB_dot3_cuda: compute C = A'*B in parallel, on the GPU(s) +//------------------------------------------------------------------------------ + +// SPDX-License-Identifier: Apache-2.0 +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2019, All Rights Reserved. +// http://suitesparse.com See GraphBLAS/Doc/License.txt for license. + +//------------------------------------------------------------------------------ + +// This function only computes C=A'*B on the GPUs. The mask must be +// present, and not complemented. The mask is always applied. 
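+// Editorial sketch (comments only, not part of this patch's logic): the kind
+// of user-level call that is expected to reach this routine is a masked mxm
+// with the first input transposed, for example:
+//
+//      // C<M> = A'*B with a structural, non-complemented mask
+//      GrB_mxm (C, M, NULL, GrB_PLUS_TIMES_SEMIRING_FP64, A, B, GrB_DESC_ST0) ;
+//
+// The semiring and descriptor shown are only one plausible choice; whether
+// the GPU path is taken at all is decided in GB_AxB_dot3_cuda_branch.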
+ +extern "C" +{ + #include "GB_mxm.h" +} +#include "GB_cuda.h" + + +#include "templates/GB_jit_AxB_dot3_phase1.cu.jit" +#include "templates/GB_jit_AxB_dot3_phase2.cu.jit" +// the 5 kernels for the 5 buckets: +#include "templates/GB_jit_AxB_dot3_phase3_dndn.cu.jit" +#include "templates/GB_jit_AxB_dot3_phase3_vsvs.cu.jit" +#include "templates/GB_jit_AxB_dot3_phase3_vssp.cu.jit" +#include "templates/GB_jit_AxB_dot3_phase3_spdn.cu.jit" +#include "templates/GB_jit_AxB_dot3_phase3_mp.cu.jit" +#include "templates/GB_jit_AxB_dot3_phase3_warpix.cu.jit" +#include "templates/reduceNonZombiesWarp.cu.jit" + +#include "GB_jit_launcher.h" + + +const std::vector header_names ={}; + + +#define GB_FREE_WORK \ +{ \ + GB_cuda_free (Nanobuckets) ; Nanobuckets = NULL ; \ + GB_cuda_free (Blockbucket) ; Blockbucket = NULL ; \ + GB_cuda_free (Bucket); Bucket = NULL; \ + GB_cuda_free (Bucketp); Bucketp = NULL; \ + GB_cuda_free (offset); offset = NULL; \ +} + +#define GB_FREE_ALL \ +{ \ + GB_FREE_WORK ; \ + GrB_Matrix_free (Chandle) ; \ +} + +GrB_Info GB_AxB_dot3_cuda // C = A'*B using dot product method +( + GrB_Matrix *Chandle, // output matrix + const GrB_Matrix M, // mask matrix + const bool Mask_struct, // if true, use the only structure of M + const GrB_Matrix A, // input matrix + const GrB_Matrix B, // input matrix + const GrB_Semiring semiring, // semiring that defines C=A*B + const bool flipxy, // if true, do z=fmult(b,a) vs fmult(a,b) + GB_Context Context +) +{ + + //-------------------------------------------------------------------------- + // check inputs + //-------------------------------------------------------------------------- + + GrB_Info info ; + ASSERT (Chandle != NULL) ; + ASSERT (*Chandle == NULL) ; + + ASSERT_MATRIX_OK (M, "M for dot3 cuda A'*B", GB0) ; + ASSERT_MATRIX_OK (A, "A for dot3 cuda A'*B", GB0) ; + ASSERT_MATRIX_OK (B, "B for dot3 cuda A'*B", GB0) ; + + ASSERT (!GB_PENDING (M)) ; + ASSERT (GB_JUMBLED_OK (M)) ; + ASSERT (!GB_ZOMBIES (M)) ; + + ASSERT (!GB_PENDING (A)) ; + ASSERT (!GB_JUMBLED (A)) ; + ASSERT (!GB_ZOMBIES (A)) ; + + ASSERT (!GB_PENDING (B)) ; + ASSERT (!GB_ZOMBIES (B)) ; + ASSERT (!GB_JUMBLED (B)) ; + + ASSERT_SEMIRING_OK (semiring, "semiring for dot3 numeric A'*B", GB0) ; + + ASSERT (A->vlen == B->vlen) ; + GBURBLE ("(GPU dot3) ") ; + + //-------------------------------------------------------------------------- + // initializations + //-------------------------------------------------------------------------- + + int ntasks = 0, number_of_sms = 0 ; + int64_t *Nanobuckets = NULL, *Blockbucket = NULL ; + int64_t *Bucket = NULL; + int64_t *Bucketp = NULL; + int64_t *offset = NULL; + (*Chandle) = NULL ; + + // just in case M is jumbled and we don't handle it yet (TODO) + GB_MATRIX_WAIT (M) ; + ASSERT (!GB_JUMBLED (M)) ; + + int device = -1; + + cudaSetDevice( 0 ) ; + + cudaGetDevice(&device); + + //-------------------------------------------------------------------------- + // get M + //-------------------------------------------------------------------------- + + const int64_t *restrict Mp = M->p ; + const int64_t *restrict Mh = M->h ; + // const int64_t *restrict Mi = M->i ; + // const GB_void *restrict Mx = M->x ; + // const size_t msize = M->type->size ; + const int64_t mvlen = M->vlen ; + const int64_t mvdim = M->vdim ; + const int64_t mnz = GB_NNZ (M) ; + const int64_t mnvec = M->nvec ; + const bool M_is_hyper = GB_IS_HYPERSPARSE( M ) ; + + const int64_t anz = GB_NNZ (A) ; + const int64_t anvec = A->nvec ; + + const int64_t bnz = GB_NNZ (B) ; + const int64_t 
bnvec = B->nvec ; + + //-------------------------------------------------------------------------- + // allocate C, the same size and # of entries as M + //-------------------------------------------------------------------------- + + // FUTURE: ctype need not be the op->ztype + GrB_Type ctype = semiring->add->op->ztype ; + int64_t cvlen = mvlen ; + int64_t cvdim = mvdim ; + int64_t cnz = mnz ; + int64_t cnvec = mnvec ; + + // TODO tell GB_CREATE where to put the data: CPU or GPU (via + // cudaMemAdvise), but this works as-is. + int sparsity = (M_is_hyper) ? GxB_HYPERSPARSE : GxB_SPARSE ; + info = GB_new_bix (Chandle, // sparse or hyper (from M), new header + ctype, cvlen, cvdim, GB_Ap_malloc, true, + sparsity, false, M->hyper_switch, cnvec, + cnz+1, // add one to cnz for GB_cumsum of Cwork + true, Context) ; + + if (info != GrB_SUCCESS) + { + // out of memory + GB_FREE_ALL ; + return (info) ; + } + + GrB_Matrix C = (*Chandle) ; + //int64_t *Citemp = C->i ; + //auto *Cxtemp = C->x ; + //cudaMalloc ((void**) &(C->i), cnz * sizeof( int64_t) ); + //cudaMalloc ((void**) &(C->x), cnz * C->type->size ); + cudaMemAdvise( C->i, cnz * sizeof ( int64_t), cudaMemAdviseSetPreferredLocation, device); + cudaMemAdvise( C->x, cnz * C->type->size , cudaMemAdviseSetPreferredLocation, device); + + int64_t *restrict Cp = M->p ; + int64_t *restrict Ch = M->h ; + // int64_t *restrict Ci = C->i ; + // use C->i as workspace + + //-------------------------------------------------------------------------- + // copy Mp and Mh into C + //-------------------------------------------------------------------------- + + //cudaMemcpy (Cp, Mp, (cnvec+1) * sizeof (int64_t), cudaMemcpyDefault) ; + if (M_is_hyper) + { + //cudaMemcpy (Ch, Mh, cnvec * sizeof (int64_t), cudaMemcpyDefault) ; + } + C->magic = GB_MAGIC ; + C->nvec_nonempty = M->nvec_nonempty ; + C->nvec = M->nvec ; + + GBURBLE ("(GPU C created and copied from M) ") ; + //-------------------------------------------------------------------------- + // stringify the semiring and the mask + //-------------------------------------------------------------------------- + + char semiring_name [GB_CUDA_STRLEN+2] ; + char semiring_code [GB_CUDA_STRLEN+2] ; + char mask_name [GB_CUDA_STRLEN+2] ; + + GB_cuda_stringify_semiring (semiring, flipxy, + ctype, A->type, B->type, M->type, Mask_struct, // matrix types + true, semiring_name, semiring_code, mask_name) ; + + GBURBLE ("(GPU stringified) ") ; + //-------------------------------------------------------------------------- + // construct the tasks for phase1 and phase2 + //-------------------------------------------------------------------------- + + // on the CPU: nthreads = GB_nthreads (cnz, chunk, nthreads_max) ; + // on the GPU: + + // # of threads in phase1 and phase2 kernel launches must be the same + #define chunksize 128 + #define SYMBOLIC_PHASE_NTHREADS 32 + #define NBUCKETS (GB_BUCKET_MERGEPATH + 1) + + number_of_sms = GB_Global_gpu_sm_get (0) ; + // C and M have cnz entries, so create ... 
+ //ntasks = ( (mnvec +7)/8 + SYMBOLIC_PHASE_NTHREADS -1 )/SYMBOLIC_PHASE_NTHREADS; + ntasks = ( mnz +chunksize -1)/chunksize; + // Idea is to have each task work on a continguous block of columns of C + ntasks = GB_IMIN( ntasks, 128*number_of_sms) ; // ntasks will be grid.x + + GBURBLE ("(GPU mnz=%ld mnvec=%ld blockDim=32, nblock= %d) ", mnz, mnvec, ntasks ) ; + + std::cout<< "ntasks, nthreads = " <p, (mnvec+1) * sizeof (int64_t), cudaMemAdviseSetPreferredLocation, device) ; + cudaMemAdvise( M->i, mnz * sizeof ( int64_t), cudaMemAdviseSetPreferredLocation, device); + cudaMemAdvise( M->x, mnz * M->type->size, cudaMemAdviseSetPreferredLocation,device) ; + + cudaMemAdvise( M->p, (mnvec+1) * sizeof (int64_t), cudaMemAdviseSetReadMostly, device) ; + cudaMemAdvise( M->i, mnz * sizeof (int64_t), cudaMemAdviseSetReadMostly, device) ; + cudaMemAdvise( M->x, mnz * M->type->size, cudaMemAdviseSetReadMostly,device) ; + */ + + cudaMemPrefetchAsync( M->p, (mnvec+1) * sizeof (int64_t), device, NULL) ; //stream_data) ; + cudaMemPrefetchAsync( M->i, mnz * sizeof (int64_t), device, NULL ) ; //stream_data) ; + cudaMemPrefetchAsync( M->x, mnz * M->type->size, device, NULL ) ; //stream_data) ; + /* + cudaMemAdvise( C->p, (mnvec+1) * sizeof (int64_t), cudaMemAdviseSetReadMostly, device) ; + cudaMemAdvise( C->i, mnz * sizeof (int64_t), cudaMemAdviseSetReadMostly, device) ; + cudaMemAdvise( C->x, mnz * C->type->size, cudaMemAdviseSetReadMostly,device) ; + */ + //cudaMemPrefetchAsync( C->p, (mnvec+1) * sizeof (int64_t), device, NULL) ; //stream_data) ; + cudaMemPrefetchAsync( C->i, mnz * sizeof (int64_t), device, NULL ); //stream_data) ; + cudaMemPrefetchAsync( C->x, mnz * C->type->size, device, NULL ); //stream_data) ; + + /* + cudaMemAdvise( A->p, (anvec+1) * sizeof (int64_t), cudaMemAdviseSetReadMostly, device) ; + cudaMemAdvise( A->i, anz * sizeof (int64_t), cudaMemAdviseSetReadMostly, device) ; + cudaMemAdvise( A->x, anz * A->type->size, cudaMemAdviseSetReadMostly,device) ; + */ + cudaMemPrefetchAsync( A->p, (anvec+1) * sizeof (int64_t), device, NULL); // stream_data) ; + cudaMemPrefetchAsync( A->i, anz * sizeof (int64_t), device, NULL ) ; //stream_data) ; + cudaMemPrefetchAsync( A->x, anz * A->type->size, device, NULL ) ; //stream_data) ; + + /* + cudaMemAdvise( B->p, (bnvec+1) * sizeof (int64_t), cudaMemAdviseSetReadMostly, device) ; + cudaMemAdvise( B->i, bnz * sizeof (int64_t), cudaMemAdviseSetReadMostly, device) ; + cudaMemAdvise( B->x, bnz * B->type->size, cudaMemAdviseSetReadMostly, device) ; + */ + cudaMemPrefetchAsync( B->p, (bnvec+1) * sizeof (int64_t), device, NULL) ; //stream_data) ; + cudaMemPrefetchAsync( B->i, bnz * sizeof (int64_t), device, NULL ) ; //stream_data) ; + cudaMemPrefetchAsync( B->x, bnz * B->type->size, device, NULL ) ; //stream_data) ; + + + + // The work to compute C(i,j) is held in Ci [p], if C(i,j) appears in + // as the pth entry in C. 
+ GB_callback mysemiring; + const char *header_name = (const char *)"mySemiRing.h"; + mysemiring.load_string(header_name, semiring_code ) ; + SR_callback_ptr = &mysemiring; + + + //cudaStream_t stream_AxB; + //cudaStreamCreate ( &stream_AxB); + //---------------------------------------------------------------------- + // phase1: assign each C(i,j) to a bucket, and count them + //---------------------------------------------------------------------- + dim3 grid( ntasks) ; + dim3 p2grid( (ntasks + SYMBOLIC_PHASE_NTHREADS -1) + / (SYMBOLIC_PHASE_NTHREADS) ) ; + dim3 block( SYMBOLIC_PHASE_NTHREADS ) ; + + std::string base_name = "GB_jit_AxB_dot3_"; + std::string Opname = "phase1_" ; + + jitify::experimental::KernelLauncher phase1Kernel = + jit::launcher( base_name + Opname + mask_name, + templates_GB_jit_AxB_dot3_phase1_cu, + header_names, + compiler_flags, + callback_wrapper) //, + //stream_AxB) + .set_kernel_inst("GB_AxB_cuda_dot3_phase1", + {M->type->name}) + .configure(grid, block); + + //---------------------------------------------------------------------- + // phase2: cumsum across the blockbuckets, propagate to thread level + //---------------------------------------------------------------------- + base_name = "GB_jit_AxB_dot3_"; + Opname = "phase2"; + jitify::experimental::KernelLauncher phase2Kernel = + jit::launcher( base_name + Opname, + templates_GB_jit_AxB_dot3_phase2_cu, + header_names, + compiler_flags, + callback_wrapper) //, + //stream_AxB) + .set_kernel_inst("GB_AxB_dot3_phase2", + {}) + .configure(p2grid, block); + + base_name = "GB_jit_AxB_dot3_"; + Opname = "phase2"; + jitify::experimental::KernelLauncher phase2endKernel = + jit::launcher( base_name + Opname, + templates_GB_jit_AxB_dot3_phase2_cu, + header_names, + compiler_flags, + callback_wrapper) //, + //stream_AxB) + .set_kernel_inst("GB_AxB_dot3_phase2end", + {}) + .configure(grid, block); + + + phase1Kernel.launch( + Nanobuckets, // array of size NBUCKETS-blockDim.x-by-gridDim.x + Blockbucket, // bucket counts, of size NBUCKETS-by-gridDim.x + // input/output: + C, // final output matrix + // inputs, not modified: + M, // mask matrix + A, // input matrix + B // input matrix + ); + + + // cudaDeviceSynchronize(); + + + GBURBLE ("(GPU phase1 done) ") ; + //for (int i = 0; i< cnz; i++){ + // printf("C[%d] = %ld\n", i , Ci[i]); + //} + //---------------------------------------------------------------------- + // phase2: cumsum across the blockbuckets, propagate to thread level + //---------------------------------------------------------------------- + int nblock = ntasks; + + phase2Kernel.launch( // input + Nanobuckets, // array of size NBUCKETS-blockDim.x-by-gridDim.x + Blockbucket, // bucket counts, of size NBUCKETS-by-gridDim.x + // input/output: + Bucketp, // global bucket cumsum, of size NBUCKETS+1 + Bucket, // global buckets, of size cnz (== mnz) + offset, + C, // final output matrix + // inputs, not modified: + cnz, // number of entries in mask and output matrix + nblock + ); + + cudaDeviceSynchronize(); + //cudaMemPrefetchAsync( offset, (NBUCKETS) * sizeof (int64_t), cudaCpuDeviceId, NULL) ; + + int64_t s= 0; + for ( int bucket = 0 ; bucket < NBUCKETS+1; ++bucket) + { + Bucketp[bucket] = s; + s+= offset[bucket]; + //printf("bucketp[%d] = %ld\n", bucket, Bucketp[bucket]); + } + + GBURBLE ("(GPU phase2 done) ") ; + + phase2endKernel.launch( // input + Nanobuckets, // array of size NBUCKETS-blockDim.x-by-gridDim.x + Blockbucket, // bucket counts, of size NBUCKETS-by-gridDim.x + // input/output: + Bucketp, // 
global bucket cumsum, of size NBUCKETS+1 + Bucket, // global buckets, of size cnz (== mnz) + offset, + C, // final output matrix + // inputs, not modified: + cnz // number of entries in mask and output matrix + ); + + cudaDeviceSynchronize(); + + GBURBLE ("(GPU phase2end done) ") ; + /* + for (int i = 0; i< cnz; i++){ + printf("C[%d],Bucket = %ld,%ld\n", i , Ci[i], Bucket[i]); + } + */ + + //---------------------------------------------------------------------- + // phase3: do the numerical work + //---------------------------------------------------------------------- + + base_name = "GB_jit_"; + std::string kernel_name = "AxB_dot3_phase3_"; + C->nzombies = Bucketp[1]; //set pre-zombie counts + + for ( int bucket = 1 ; bucket < NBUCKETS; ++bucket) + { + std::string Opname = ""; + int sz = 0 ; + + const char* jit_template; + + int64_t start = Bucketp[bucket]; + int64_t end = Bucketp[bucket+1]; + + //if( (end-start>0) && (start == Bucketp[1]) ) start = Bucketp[0]; //add in zombie slots + + int64_t Cnz = end- start; + + int gridsz, blocksz; + + //Nothing to do, next bucket + if ( Cnz == 0 ) continue; + + GBURBLE ("\n\n(GPU phase3 bucket,bucketsize= %d,%ld) ",bucket,Cnz) ; + + switch (bucket) + { + + //-------------------------------------------------------------- + // not a bucket ... bring out your dead: + //-------------------------------------------------------------- + + case GB_BUCKET_ZOMBIE : // C(i,j) is a zombie (not a bucket) + break ; + + //-------------------------------------------------------------- + // CUDA kernel: dndn, handles a single bucket: + //-------------------------------------------------------------- + + // both A(:,i) and B(:,j) are dense + case GB_BUCKET_DNDN : + Opname = "dndn" ; + jit_template = templates_GB_jit_AxB_dot3_phase3_dndn_cu; + blocksz = 32; + gridsz = ( Cnz -1 + blocksz)/blocksz; + break ; + + //-------------------------------------------------------------- + // CUDA kernel: spdn, handles 4 buckets: + //-------------------------------------------------------------- + + // A(:,i) is dense and B(:,j) is very sparse (< 256 entries) + case GB_BUCKET_DNVS : + // A(:,i) is very sparse (< 256 entries) and B(:,j) is dense + case GB_BUCKET_VSDN : + sz = 64 ; + Opname = "spdn" ; + jit_template = templates_GB_jit_AxB_dot3_phase3_spdn_cu; + blocksz = 32; + gridsz = ( Cnz -1 + blocksz)/blocksz; + break ; + + // A(:,i) is dense and B(:,j) is sparse (>= 256 entries) + case GB_BUCKET_DNSP : + // A(:,i) is sparse (>= 256 entries) and B(:,j) is dense + case GB_BUCKET_SPDN : + sz = 256 ; + Opname = "spdn" ; + jit_template = templates_GB_jit_AxB_dot3_phase3_spdn_cu; + blocksz = 32; + gridsz = ( Cnz -1 + blocksz)/blocksz; + break ; + + //-------------------------------------------------------------- + // CUDA kernel: vssp, handles 1 bucket, uses binary search: + //-------------------------------------------------------------- + + // A(:,i) is very sparse compared to B(:,j), or visa versa + case GB_BUCKET_VSSP : + Opname = "vssp" ; + jit_template = templates_GB_jit_AxB_dot3_phase3_vssp_cu; + blocksz = 32; + gridsz = ( Cnz -1 + blocksz)/blocksz; + break ; + + //-------------------------------------------------------------- + // CUDA kernel: vsvs, handles 4 buckets: + //-------------------------------------------------------------- + + // let len = nnz (A (:,i) + nnz (B (:,j)), then: + + case GB_BUCKET_VSVS_256 : sz += 256-64 ; + case GB_BUCKET_VSVS_64 : sz += 64-16 ; + case GB_BUCKET_VSVS_16 : sz += 16-4 ; + case GB_BUCKET_VSVS_4 : sz += 4 ; + Opname = "vsvs" ; + 
jit_template = templates_GB_jit_AxB_dot3_phase3_vsvs_cu; + blocksz = 1024; + gridsz = GB_IMIN( 1024*number_of_sms, ( Cnz + blocksz -1 )/blocksz); + gridsz = ( Cnz + blocksz -1 )/blocksz; + /* + Opname = "warpix" ; + jit_template = templates_GB_jit_AxB_dot3_phase3_warpix_cu; + blocksz = 32; + gridsz = GB_IMIN( (mnvec+15)/16, 256*number_of_sms); + */ + break ; + + //-------------------------------------------------------------- + // CUDA kernel: mp, use the merge-path method: + //-------------------------------------------------------------- + + case GB_BUCKET_MERGEPATH : + Opname = "mp" ; + jit_template = templates_GB_jit_AxB_dot3_phase3_mp_cu; + blocksz = 32; + gridsz = ( Cnz -1 + blocksz)/blocksz; + break ; + + case GB_BUCKET_WARP_IX : sz = 32 ; + Opname = "warpix" ; + jit_template = templates_GB_jit_AxB_dot3_phase3_warpix_cu; + blocksz = 32; + gridsz = GB_IMIN( (mnvec+15)/16, 256*number_of_sms); + break ; + + default: + break ; + } + + dim3 grid(gridsz); + dim3 block(blocksz); + + std::cout<< "Kernel name =" <name, + A->type->name, + B->type->name, + semiring->multiply->xtype->name, + semiring->multiply->ytype->name, + semiring->multiply->ztype->name }) + .configure(grid, block) //if commented, use implicit 1D configure in launch + .launch( + start, // input/output: + end, // global bucket cumsum, of size NBUCKETS+1 + Bucket, // global buckets, of size cnz (== mnz) + C, // final output matrix + // inputs, not modified: + M, // Mi used for column index + A, // A matrix + B, // B matrix + sz // only used for sparse-sparse cases + + ); + + cudaDeviceSynchronize(); + } + GBURBLE ("(GPU phase3 done) ") ; + + std::string reduce_kernel_name = "reduceNonZombiesWarp"; + const char* jit_template; + #define red_blocksz 1024 + jit_template = templates_reduceNonZombiesWarp_cu; + int num_reduce_blocks = GB_IMIN( 32*number_of_sms, (cnz + red_blocksz -1)/ red_blocksz ) ; + dim3 red_grid( num_reduce_blocks ) ; + dim3 red_block( red_blocksz ) ; + + int32_t *block_sum; + //cudaMallocManaged ((void**) &block_sum, (num_reduce_blocks)*sizeof(int32_t)) ; + block_sum = (int32_t*)GB_cuda_malloc( (num_reduce_blocks)*sizeof(int32_t)) ; + + GBURBLE ("(GPU reduce launch nblocks,blocksize= %d,%d )\n", num_reduce_blocks, red_blocksz) ; + jit::launcher( reduce_kernel_name + "_" + semiring_name, + jit_template, + header_names, + compiler_flags, + callback_wrapper) + .set_kernel_inst( reduce_kernel_name , { ctype->name }) + .configure(red_grid, red_block) //if commented, use implicit 1D configure in launch + .launch( + C->i, // index vector, only sum up values >= 0 + C->x, // input pointer to vector to reduce, with zombies + block_sum, // Block sums on return + (unsigned int)cnz // length of vector to reduce to scalar + + ); + + cudaDeviceSynchronize(); + + int32_t num_triangles = 0; + for (int i = 0; i< num_reduce_blocks; i++){ + //printf("block%d num_triangles = %d\n", i, block_sum[i] ); + num_triangles += block_sum[i] ; + } + printf("num_triangles = %d\n", num_triangles ); + + GB_cuda_free( block_sum ); + //cudaMemPrefetchAsync( C->p, (mnvec+1) * sizeof (int64_t), cudaCpuDeviceId, NULL) ; //stream_data ) ; + //cudaMemPrefetchAsync( C->i, cnz * sizeof (int64_t), cudaCpuDeviceId, NULL ) ; //stream_data ) ; + //cudaMemPrefetchAsync( C->x, cnz * sizeof (int32_t), cudaCpuDeviceId, NULL ) ; //stream_data ) ; + /* + cudaMemcpy( Citemp, C->i, cnz * sizeof( int64_t), cudaMemcpyDefault ); + cudaMemcpy( Cxtemp, C->x, cnz * C->type->size, cudaMemcpyDefault ); + GB_cuda_free( C->i); + GB_cuda_free( C->x); + C->i = Citemp; + C->x = 
Cxtemp; + */ + + cudaDeviceSynchronize(); + + return GrB_SUCCESS; +} + diff --git a/GraphBLAS/CUDA/GB_AxB_dot3_cuda_branch.cu b/GraphBLAS/CUDA/GB_AxB_dot3_cuda_branch.cu new file mode 100644 index 0000000000..79083969f5 --- /dev/null +++ b/GraphBLAS/CUDA/GB_AxB_dot3_cuda_branch.cu @@ -0,0 +1,41 @@ + +// Decide branch direction for GPU use for the dot-product MxM +extern "C" +{ + #include "GB_mxm.h" +} +#include "GB_cuda.h" + +bool GB_AxB_dot3_cuda_branch +( + const GrB_Matrix M, // mask matrix + const bool Mask_struct, // if true, use the only structure of M + const GrB_Matrix A, // input matrix + const GrB_Matrix B, // input matrix + const GrB_Semiring semiring, // semiring that defines C=A*B + const bool flipxy, // if true, do z=fmult(b,a) vs fmult(a,b) + GB_Context Context +) +{ + // very rough estimate of the work to do + double adeg = ((double) GB_NNZ (A)) / ((double) GB_IMAX (1, A->nvec)) ; + double bdeg = ((double) GB_NNZ (B)) / ((double) GB_IMAX (1, B->nvec)) ; + double work = GB_NNZ (M) * GB_IMIN (adeg, bdeg) ; + + // TODO if A or B are not accessed (first, 2nd, or pair ops) + // then the type if A can be user-defined here, for CUDA. + + int ngpus_to_use = GB_ngpus_to_use (work) ; + GBURBLE (" work:%g gpus:%d ", work, ngpus_to_use) ; + if (ngpus_to_use > 0 && semiring->semiring_is_builtin + && (A->type->code != GB_UDT_code) + && (B->type->code != GB_UDT_code)) + { + return true; + } + else + { + return false; + } + +} diff --git a/GraphBLAS/CUDA/GB_callback.hpp b/GraphBLAS/CUDA/GB_callback.hpp new file mode 100644 index 0000000000..f277a0e320 --- /dev/null +++ b/GraphBLAS/CUDA/GB_callback.hpp @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: Apache-2.0 + +// Implementations of string callbacks +#include +#pragma once +// Define function pointer we will use later +//std::istream* (*file_callback)(std::string, std::iostream&); + +// Define a factory class for building any buffer of text +class GB_callback { + char *callback_string; + const char *include_filename; + public: + void load_string(const char *fname, char *input){ + callback_string = input; + include_filename = fname; + } + std::istream* callback( std::string filename, std::iostream& tmp_stream) { + if ( filename == std::string(this->include_filename) ) + { + tmp_stream << this->callback_string; + return &tmp_stream; + } + else + { + return nullptr; + } + } +}; + diff --git a/GraphBLAS/CUDA/GB_cuda.h b/GraphBLAS/CUDA/GB_cuda.h new file mode 100644 index 0000000000..d8ecf291b6 --- /dev/null +++ b/GraphBLAS/CUDA/GB_cuda.h @@ -0,0 +1,98 @@ +//------------------------------------------------------------------------------ +// GB_cuda.h: definitions for using CUDA in GraphBLAS +//------------------------------------------------------------------------------ + +// SuiteSparse:GraphBLAS/CUDA, (c) NVIDIA Corp. 2017-2019, All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//------------------------------------------------------------------------------ + +// This file is #include'd only in the GraphBLAS/CUDA/GB_cuda*.cu source files. + +#ifndef GB_CUDA_H +#define GB_CUDA_H + +// nvcc chokes on the 'restrict' keyword, so define it to the empty string +// for compiling the *.cu files. +#define restrict + +// nvcc also chokes on fpclassify (an ANSI C11 construct that does not appear +// in C++11, it seems). It also issues spurious warnings about compiler +// pragmas. Source/GB.h avoids these constructs if GB_NVCC is defined. 
+#define GB_NVCC + +extern "C" +{ + #include "GB.h" + #include "GB_Global.h" + #include + #include +} + +// GB_cuda_gateway.h is also included in Source/GB* files, which are not +// compiled with nvcc, nor do they see the cuda.h or cuda_runtime.h +// definitions. Thus, this #include comes first. +#include "GB_cuda_gateway.h" + +#include "GB_cuda_global.h" + +// Finally, include the CUDA definitions +#include "cuda.h" +#include "cuda_runtime.h" +#include "jitify.hpp" + +#include +#include +#include + +#define CHECK_CUDA_SIMPLE(call) \ + do { \ + cudaError_t err = call; \ + if (err != cudaSuccess) { \ + const char* str = cudaGetErrorName( err); \ + std::cout << "(CUDA runtime) returned " << str; \ + std::cout << " (" << __FILE__ << ":" << __LINE__ << ":" << __func__ \ + << "())" << std::endl; \ + return (GrB_PANIC) ; \ + } \ + } while (0) + + +//------------------------------------------------------------------------------ +// GB_CUDA_CATCH: catch error from a try { ... } region +//------------------------------------------------------------------------------ + +// Usage: Must be used in a GB* function that returns GrB_Info, and has a +// GB_Context Context parameter. +// +// #define GB_FREE_ALL { some macro to free all temporaries } +// GrB_Info info ; +// try { ... do stuff that can through an exception } +// GB_CUDA_CATCH (info) ; + +#define GB_CUDA_CATCH(info) \ + catch (std::exception& e) \ + { \ + printf ("CUDA error: %s\n", e.what ( )) ; \ + info = GrB_PANIC ; \ + /* out_of_memory : info = GrB_OUT_OF_MEMORY ; */ \ + /* nulltpr: info = ... ; */ \ + /* no gpus here: info = GrB_PANIC ; */ \ + } \ + if (info != GrB_SUCCESS) \ + { \ + /* CUDA failed */ \ + GB_FREE_ALL ; \ + return (GB_ERROR (info, (GB_LOG, "CUDA died\n"))) ; \ + } + +// 12 buckets: computed by up to 11 kernel launches (zombies need no work...), +// using 5 different kernels (with different configurations depending on the +// bucket). + #include "GB_cuda_buckets.h" +extern "C" +{ + #include "GB_cuda_stringify.h" + +} +#endif diff --git a/GraphBLAS/CUDA/GB_cuda_buckets.h b/GraphBLAS/CUDA/GB_cuda_buckets.h new file mode 100644 index 0000000000..4c616c5252 --- /dev/null +++ b/GraphBLAS/CUDA/GB_cuda_buckets.h @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: Apache-2.0 + +//------------------------------------------------------------------------------ +// GB_cuda_buckets.h: definitions for buckets using for dot3 +//------------------------------------------------------------------------------ + +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2019, All Rights Reserved. +// http://suitesparse.com See GraphBLAS/Doc/License.txt for license. + +//------------------------------------------------------------------------------ + +// This file is #include'd only in the GraphBLAS/CUDA/GB_cuda*.cu source files. + +#ifndef GB_CUDA_BUCKETS_H +#define GB_CUDA_BUCKETS_H + +// nvcc chokes on the 'restrict' keyword, so define it to the empty string +// for compiling the *.cu files. +#define restrict + +// nvcc also chokes on fpclassify (an ANSI C11 construct that does not appear +// in C++11, it seems). It also issues spurious warnings about compiler +// pragmas. Source/GB.h avoids these constructs if GB_NVCC is defined. +#define GB_NVCC + + +// 12 buckets: computed by up to 11 kernel launches (zombies need no work...), +// using 5 different kernels (with different configurations depending on the +// bucket). 
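+// Editorial sketch (hypothetical, comments only): phase1 is expected to map a
+// per-entry work estimate onto the codes below; for the vsvs buckets, a
+// selection on len = nnz (A (:,i)) + nnz (B (:,j)) consistent with the
+// thresholds documented in the enum would look like:
+//
+//      // (len <=   4) ? GB_BUCKET_VSVS_4   :
+//      // (len <=  16) ? GB_BUCKET_VSVS_16  :
+//      // (len <=  64) ? GB_BUCKET_VSVS_64  :
+//      // (len <= 256) ? GB_BUCKET_VSVS_256 : GB_BUCKET_MERGEPATH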
+typedef enum +{ + // bring out your dead: + GB_BUCKET_ZOMBIE = 0, // C(i,j) is a zombie (not a bucket) + +// dot3: C=A'B, M is sparse or hyper, C is sparse or hyper +// 32 kernels A,B: (hyper,sparse,bitmap,full)^2 x M is (sparse/hyper) + +// a full/full kernel: + // CUDA kernel: dndn, handles a single bucket: + // both A(:,i) and B(:,j) are dense + GB_BUCKET_DNDN = 1, + +// two full/(sparse,hyper) kernels: + // CUDA kernel: spdn, handles 4 buckets: + // A(:,i) is dense and B(:,j) is very sparse (< 256 entries) + GB_BUCKET_DNVS = 2, + // A(:,i) is dense and B(:,j) is sparse (>= 256 entries) + GB_BUCKET_DNSP = 3, + +// a sparse/full kernel + // A(:,i) is very sparse (< 256 entries) and B(:,j) is dense + GB_BUCKET_VSDN = 4, + // A(:,i) is sparse (>= 256 entries) and B(:,j) is dense + GB_BUCKET_SPDN = 5, + +// a sparse/bitmap kernel +// a bitmap/bitmap kernel +// a bitmap/sparse kernel +// ... + + +// sparse/sparse: + // CUDA kernel: vssp, handles 1 bucket, uses binary search: + // A(:,i) is very sparse compared to B(:,j), or visa versa + GB_BUCKET_VSSP = 6, + + // CUDA kernel: vsvs, handles 4 buckets: + // let len = nnz (A (:,i) + nnz (B (:,j)), then: + GB_BUCKET_VSVS_4 = 7, // len <= 4 + GB_BUCKET_VSVS_16 = 8, // len <= 16 + GB_BUCKET_VSVS_64 = 9, // len <= 64 + GB_BUCKET_VSVS_256 = 10, // len <= 256 + + // CUDA kernel: mp, use the merge-path method: + GB_BUCKET_MERGEPATH = 11, + + // CUDA kernel: warpix, use the warp-intersect method, unused so far: + GB_BUCKET_WARP_IX = 12 +} +GB_bucket_code ; + +#endif diff --git a/GraphBLAS/CUDA/GB_cuda_calloc.cu b/GraphBLAS/CUDA/GB_cuda_calloc.cu new file mode 100644 index 0000000000..4d5d46bf95 --- /dev/null +++ b/GraphBLAS/CUDA/GB_cuda_calloc.cu @@ -0,0 +1,31 @@ +//------------------------------------------------------------------------------ +// GB_cuda_calloc.cu: wrapper for cudaMallocManaged and memset +//------------------------------------------------------------------------------ + +// SPDX-License-Identifier: Apache-2.0 +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2019, All Rights Reserved. +// http://suitesparse.com See GraphBLAS/Doc/License.txt for license. + +//------------------------------------------------------------------------------ + +#include "GB_cuda.h" + +void *GB_cuda_calloc (size_t n, size_t size) // standcard calloc signature +{ + + // malloc the space + void *p = GB_cuda_malloc (n * size) ; + + if (p == NULL) + { + // out of memory, or other CUDA error + return (NULL) ; + } + + // set the space to zero + memset (p, 0, n * size) ; + + // return the result + return (p) ; +} + diff --git a/GraphBLAS/CUDA/GB_cuda_cumsum.cu b/GraphBLAS/CUDA/GB_cuda_cumsum.cu new file mode 100644 index 0000000000..f3dc45569e --- /dev/null +++ b/GraphBLAS/CUDA/GB_cuda_cumsum.cu @@ -0,0 +1,67 @@ +//------------------------------------------------------------------------------ +// GB_cuda_cumsum: cumlative sum of an array using GPU acceleration +//------------------------------------------------------------------------------ + +// SPDX-License-Identifier: Apache-2.0 +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2019, All Rights Reserved. +// http://suitesparse.com See GraphBLAS/Doc/License.txt for license. 
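+// Editorial worked example of the semantics described below (comments only):
+// with n = 4 and count = {3, 1, 0, 2, *} on input, the output is
+// count = {0, 3, 4, 4, 6}, where count [4] = 6 is the total sum.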
+ +//------------------------------------------------------------------------------ + +// Compute the cumulative sum of an array count[0:n], of size n+1 +// in pseudo-MATLAB notation: + +// k = sum (count [0:n-1] != 0) ; + +// count = cumsum ([0 count[0:n-1]]) ; + +// That is, count [j] on input is overwritten with the value of +// sum (count [0..j-1]). count [n] is implicitly zero on input. +// On output, count [n] is the total sum. + +#include "GB_cuda.h" +#include + +GrB_Info GB_cuda_cumsum // compute the cumulative sum of an array +( + int64_t *restrict count, // size n+1, input/output + const int64_t n +) +{ + //-------------------------------------------------------------------------- + // check inputs + //-------------------------------------------------------------------------- + + ASSERT (count != NULL) ; + ASSERT (n >= 0) ; + + //-------------------------------------------------------------------------- + // count = cumsum ([0 count[0:n-1]]) ; + //-------------------------------------------------------------------------- + void *d_temp_storage = NULL; + size_t temp_storage_bytes; + cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, count, count, (int)n); + d_temp_storage = GB_malloc_memory( temp_storage_bytes, 1); + if ( d_temp_storage == NULL){ + return GrB_OUT_OF_MEMORY; + } + + // Run + CubDebugExit(cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, count, count, n)); + + // Check for correctness (and display results, if specified) + #ifdef GB_DEBUG + int compare = CompareDeviceResults(h_reference, count, num_items, true, g_verbose); + ASSERT( compare == 0); + #endif + + // Cleanup + GB_free_memory(d_temp_storage ) ; + + return GrB_SUCCESS; +} + + + + + diff --git a/GraphBLAS/CUDA/GB_cuda_free.cu b/GraphBLAS/CUDA/GB_cuda_free.cu new file mode 100644 index 0000000000..1fa21b3dfc --- /dev/null +++ b/GraphBLAS/CUDA/GB_cuda_free.cu @@ -0,0 +1,19 @@ +//------------------------------------------------------------------------------ +// GB_cuda_free.cu: wrapper for cudaFree +//------------------------------------------------------------------------------ + +// SPDX-License-Identifier: Apache-2.0 +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2019, All Rights Reserved. +// http://suitesparse.com See GraphBLAS/Doc/License.txt for license. + +//------------------------------------------------------------------------------ + +#include "GB_cuda.h" +#include "rmm/detail/cnmem.h" + +void GB_cuda_free (void *p) // standard free signature +{ + cnmemFree( p , NULL); + //printf(" GPU %d freeing mem\n", device); +} + diff --git a/GraphBLAS/CUDA/GB_cuda_get_device_count.cu b/GraphBLAS/CUDA/GB_cuda_get_device_count.cu new file mode 100644 index 0000000000..cf7ed54962 --- /dev/null +++ b/GraphBLAS/CUDA/GB_cuda_get_device_count.cu @@ -0,0 +1,21 @@ +//------------------------------------------------------------------------------ +// GB_cuda_get_device_count.cu: find out how many GPUs exist +//------------------------------------------------------------------------------ + +// SPDX-License-Identifier: Apache-2.0 +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2019, All Rights Reserved. +// http://suitesparse.com See GraphBLAS/Doc/License.txt for license. 
+ +//------------------------------------------------------------------------------ + +#include "GB_cuda.h" + +bool GB_cuda_get_device_count // true if OK, false if failure +( + int *gpu_count // return # of GPUs in the system +) +{ + cudaError_t err = cudaGetDeviceCount (gpu_count) ; + return (err == cudaSuccess) ; +} + diff --git a/GraphBLAS/CUDA/GB_cuda_get_device_properties.cu b/GraphBLAS/CUDA/GB_cuda_get_device_properties.cu new file mode 100644 index 0000000000..d3b48f1f64 --- /dev/null +++ b/GraphBLAS/CUDA/GB_cuda_get_device_properties.cu @@ -0,0 +1,97 @@ +//------------------------------------------------------------------------------ +// GB_cuda_get_device_properties.cu: get the properties of a GPU +//------------------------------------------------------------------------------ + +// SPDX-License-Identifier: Apache-2.0 +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2019, All Rights Reserved. +// http://suitesparse.com See GraphBLAS/Doc/License.txt for license. + +//------------------------------------------------------------------------------ + +#include "GB_cuda.h" + +bool GB_cuda_get_device ( int &device){ + bool goodreturn = false; + if (&device == NULL) + { + // invalid inputs + return (false) ; + } + + CHECK_CUDA_SIMPLE ( cudaGetDevice( &device ) ); + goodreturn = true; + + return goodreturn; + +} + +bool GB_cuda_set_device( int device) { + bool goodreturn = false; + if (device < 0) + { + // invalid inputs + return (false) ; + } + + CHECK_CUDA_SIMPLE ( cudaSetDevice( device ) ); + goodreturn = true; + + return goodreturn; +} + +bool GB_cuda_get_device_properties // true if OK, false if failure +( + int device, + GB_cuda_device *prop +) +{ + + //-------------------------------------------------------------------------- + // check inputs + //-------------------------------------------------------------------------- + bool goodreturn = false; + if (prop == NULL || device < 0) + { + // invalid inputs + return (false) ; + } + + int old_device; + CHECK_CUDA_SIMPLE ( cudaGetDevice( &old_device ) ) ; + + + //-------------------------------------------------------------------------- + // get the properties + //-------------------------------------------------------------------------- + int num_sms; + int compute_capability_major; + int compute_capability_minor; + size_t memfree, memtotal; + + CHECK_CUDA_SIMPLE( cudaDeviceGetAttribute (&num_sms, + cudaDevAttrMultiProcessorCount, + device) ); + CHECK_CUDA_SIMPLE( cudaDeviceGetAttribute (&compute_capability_major, + cudaDevAttrComputeCapabilityMajor, + device) ); + CHECK_CUDA_SIMPLE( cudaDeviceGetAttribute (&compute_capability_minor, + cudaDevAttrComputeCapabilityMajor, + device) ); + + CHECK_CUDA_SIMPLE ( cudaSetDevice( device ) ); + CHECK_CUDA_SIMPLE ( cudaMemGetInfo( & memfree, &memtotal) ) ; + CHECK_CUDA_SIMPLE ( cudaSetDevice( old_device ) ); + + prop->total_global_memory = memtotal; + prop->number_of_sms = num_sms; + prop->compute_capability_major = compute_capability_major; + prop->compute_capability_minor = compute_capability_minor; + + goodreturn = true; + //-------------------------------------------------------------------------- + // return result + //-------------------------------------------------------------------------- + + return goodreturn; +} + diff --git a/GraphBLAS/CUDA/GB_cuda_global.cpp b/GraphBLAS/CUDA/GB_cuda_global.cpp new file mode 100644 index 0000000000..fbe0d168d8 --- /dev/null +++ b/GraphBLAS/CUDA/GB_cuda_global.cpp @@ -0,0 +1,24 @@ 
+//------------------------------------------------------------------------------ +// GB_cuda_global.cpp: accessor functions for global GraphBLAS/CUDA variables +//------------------------------------------------------------------------------ + +// SuiteSparse:GraphBLAS/CUDA, (c) NVIDIA Corp. 2017-2019, All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//------------------------------------------------------------------------------ + +#include "GB_callback.hpp" + +//Global definition required here, sorry +GB_callback *SR_callback_ptr; // thunk + +std::istream* callback_wrapper +( + std::string file_name, // string with the requested "file" name + std::iostream& file_stream // the I/O stream for the "file" contents +) +{ + return SR_callback_ptr->callback (file_name, file_stream) ; +} + + diff --git a/GraphBLAS/CUDA/GB_cuda_global.h b/GraphBLAS/CUDA/GB_cuda_global.h new file mode 100644 index 0000000000..8b26e728a0 --- /dev/null +++ b/GraphBLAS/CUDA/GB_cuda_global.h @@ -0,0 +1,18 @@ +//------------------------------------------------------------------------------ +// GB_cuda_global.h: global variables needed for GraphBLAS/CUDA +//------------------------------------------------------------------------------ + +// SuiteSparse:GraphBLAS/CUDA, (c) NVIDIA Corp. 2017-2019, All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +//------------------------------------------------------------------------------ + +#ifndef GB_CUDA_GLOBAL_H +#define GB_CUDA_GLOBAL_H + +#include "GB_callback.hpp" + +std::istream* callback_wrapper( std::string file_name, std::iostream& tmp) ; + +#endif + diff --git a/GraphBLAS/CUDA/GB_cuda_malloc.cu b/GraphBLAS/CUDA/GB_cuda_malloc.cu new file mode 100644 index 0000000000..64c5928961 --- /dev/null +++ b/GraphBLAS/CUDA/GB_cuda_malloc.cu @@ -0,0 +1,24 @@ +//------------------------------------------------------------------------------ +// GB_cuda_malloc.cu: wrapper for cuda Managed Memory allocator, or pool +//------------------------------------------------------------------------------ + +// SPDX-License-Identifier: Apache-2.0 +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2019, All Rights Reserved. +// http://suitesparse.com See GraphBLAS/Doc/License.txt for license. + +//------------------------------------------------------------------------------ + +#include "GB_cuda.h" +#include "rmm/detail/cnmem.h" + +void *GB_cuda_malloc (size_t size) // standard malloc signature +{ + void *p = NULL ; + + cnmemMalloc( &p, size, NULL); + + return p; + + +} + diff --git a/GraphBLAS/CUDA/GB_cuda_stringifier.cpp b/GraphBLAS/CUDA/GB_cuda_stringifier.cpp new file mode 100644 index 0000000000..52f90dfab7 --- /dev/null +++ b/GraphBLAS/CUDA/GB_cuda_stringifier.cpp @@ -0,0 +1,877 @@ +// Class to manage both stringify functions from semiring, ops and monoids to char buffers +// Also provides a iostream callback to deliver the buffer to jitify as if read from a file + +// (c) Nvidia Corp. 
2020 All rights reserved +// SPDX-License-Identifier: Apache-2.0 + +// Implementations of string callbacks +#pragma once +#include +#include "GB.h" +#include "GB_cuda_stringify.h" + +// Define function pointer we will use later +//std::istream* (*file_callback)(std::string, std::iostream&); + +// Define a factory class for building any buffer of text +class GB_cuda_stringifier { + char callback_buffer[2048]; + char *callback_string; + const char *include_filename; + + public: + +//------------------------------------------------------------------------------ +// load string: set string and file name to mimic +//------------------------------------------------------------------------------ + void load_string(const char *fname, char *input) + { + callback_string = input; + include_filename = fname; + } + +//------------------------------------------------------------------------------ +// callback: return string as if it was read from a file +//------------------------------------------------------------------------------ + + std::istream* callback( std::string filename, std::iostream& tmp_stream) + { + if ( filename == std::string(this->include_filename) ) + { + tmp_stream << this->callback_string; + return &tmp_stream; + } + else + { + return nullptr; + } + } + +//------------------------------------------------------------------------------ +// stringify_identity: return string for identity value +//------------------------------------------------------------------------------ +#define ID( x) IDENT = (x) + void stringify_identity + ( + // output: + char *code_string, // string with the #define macro + // input: + GB_Opcode opcode, // must be a built-in binary operator from a monoid + GB_Type_code zcode // type code used in the opcode we want + ) + { + const char *IDENT; + switch (opcode) + { + case GB_MIN_opcode : + + switch (zcode) + { + case GB_BOOL_code : ID ("true") ; // boolean AND + case GB_INT8_code : ID ("INT8_MAX") ; + case GB_INT16_code : ID ("INT16_MAX") ; + case GB_INT32_code : ID ("INT32_MAX") ; + case GB_INT64_code : ID ("INT64_MAX") ; + case GB_UINT8_code : ID ("UINT8_MAX") ; + case GB_UINT16_code : ID ("UINT16_MAX") ; + case GB_UINT32_code : ID ("UINT32_MAX") ; + case GB_UINT64_code : ID ("UINT64_MAX") ; + default : ID ("INFINITY") ; + } + break ; + + case GB_MAX_opcode : + + switch (zcode) + { + case GB_BOOL_code : ID ("false") ; // boolean OR + case GB_INT8_code : ID ("INT8_MIN") ; + case GB_INT16_code : ID ("INT16_MIN") ; + case GB_INT32_code : ID ("INT32_MIN") ; + case GB_INT64_code : ID ("INT64_MIN") ; + case GB_UINT8_code : ID ("0") ; + case GB_UINT16_code : ID ("0") ; + case GB_UINT32_code : ID ("0") ; + case GB_UINT64_code : ID ("0") ; + default : ID ("(-INFINITY)") ; + } + break ; + + case GB_PLUS_opcode : ID ("0") ; + case GB_TIMES_opcode : ID ("1") ; + case GB_LOR_opcode : ID ("false") ; + case GB_LAND_opcode : ID ("true") ; + case GB_LXOR_opcode : ID ("false") ; + // case GB_LXNOR_opcode : + case GB_EQ_opcode : ID ("true") ; + // case GB_ANY_opcode : + default : ID ("0") ; + } + snprintf (code_string, GB_CUDA_STRLEN, "#define GB_IDENTITY (%s)", IDENT) ; + + } + + + const char *GB_cuda_stringify_opcode + ( + GB_Opcode opcode // opcode of GraphBLAS operator + ) + { + switch (opcode) + { + case GB_FIRST_opcode : return ("1st") ; + // case GB_ANY_opcode : return ("any") ; + case GB_SECOND_opcode : return ("2nd") ; + case GB_MIN_opcode : return ("min") ; + case GB_MAX_opcode : return ("max") ; + case GB_PLUS_opcode : return ("plus") ; + case GB_MINUS_opcode : return 
("minus") ; + case GB_RMINUS_opcode : return ("rminus") ; + case GB_TIMES_opcode : return ("times") ; + case GB_DIV_opcode : return ("div") ; + case GB_RDIV_opcode : return ("rdiv") ; + case GB_EQ_opcode : return ("eq") ; + case GB_ISEQ_opcode : return ("iseq") ; + case GB_NE_opcode : return ("ne") ; + case GB_ISNE_opcode : return ("isne") ; + case GB_GT_opcode : return ("gt") ; + case GB_ISGT_opcode : return ("isgt") ; + case GB_LT_opcode : return ("lt") ; + case GB_ISLT_opcode : return ("islt") ; + case GB_GE_opcode : return ("ge") ; + case GB_ISGE_opcode : return ("isge") ; + case GB_LE_opcode : return ("le") ; + case GB_ISLE_opcode : return ("isle") ; + case GB_LOR_opcode : return ("lor") ; + case GB_LAND_opcode : return ("land") ; + case GB_LXOR_opcode : return ("lxor") ; + // case GB_BOR_opcode : ... bitwise ops + // x | y, etc + // case GB_PAIR_opcode : + default : ; + } + + return ("") ; + } + + void stringify_binop + ( + // output: + char *code_string, // string with the #define macro + // input: + const char *macro_name, // name of macro to construct + GB_Opcode opcode, // opcode of GraphBLAS operator to convert into a macro + GB_Type_code zcode // op->ztype->code of the operator + ) + { + + // The binop macro generates an expression, not a full statement (there + // is no semicolon). + + // for example: + // #define GB_MULT(x,y) ((x) * (y)) + + const char *f ; + + switch (opcode) + { + + case GB_FIRST_opcode : // 7: z = x + + f = "(x)" ; + break ; + + // case GB_ANY_opcode : + case GB_SECOND_opcode : // 8: z = y + + f = "(y)" ; + break ; + + case GB_MIN_opcode : // 9: z = min(x,y) + + switch (zcode) + { + case GB_BOOL_code : f = "(x) && (y)" ; + case GB_FP32_code : f = "fminf (x,y)" ; + case GB_FP64_code : f = "fmin (x,y)" ; + default : f = "GB_IMIN (x,y)" ; + } + break ; + + case GB_MAX_opcode : // 10: z = max(x,y) + + switch (zcode) + { + case GB_BOOL_code : f = "(x) || (y)" ; + case GB_FP32_code : f = "fmaxf (x,y)" ; + case GB_FP64_code : f = "fmax (x,y)" ; + default : f = "GB_IMAX (x,y)" ; + } + break ; + + case GB_PLUS_opcode : // 11: z = x + y + + switch (zcode) + { + case GB_BOOL_code : f = "(x) || (y)" ; + default : f = "(x) + (y)" ; + } + break ; + + case GB_MINUS_opcode : // 12: z = x - y + + switch (zcode) + { + case GB_BOOL_code : f = "(x) != (y)" ; + default : f = "(x) - (y)" ; + } + break ; + + case GB_RMINUS_opcode : // 13: z = y - x + + switch (zcode) + { + case GB_BOOL_code : f = "(x) != (y)" ; + default : f = "(y) - (x)" ; + } + break ; + + case GB_TIMES_opcode : // 14: z = x * y + + switch (zcode) + { + case GB_BOOL_code : f = "(x) && (y)" ; + default : f = "(x) * (y)" ; + } + break ; + + case GB_DIV_opcode : // 15: z = x / y ; + + switch (zcode) + { + case GB_BOOL_code : f = "(x)" ; + case GB_INT8_code : f = "GB_IDIV_SIGNED (x,y,8)" ; + case GB_INT16_code : f = "GB_IDIV_SIGNED (x,y,16)" ; + case GB_INT32_code : f = "GB_IDIV_SIGNED (x,y,32)" ; + case GB_INT64_code : f = "GB_IDIV_SIGNED (x,y,64)" ; + case GB_UINT8_code : f = "GB_IDIV_UNSIGNED (x,y,8)" ; + case GB_UINT16_code : f = "GB_IDIV_UNSIGNED (x,y,16)" ; + case GB_UINT32_code : f = "GB_IDIV_UNSIGNED (x,y,32)" ; + case GB_UINT64_code : f = "GB_IDIV_UNSIGNED (x,y,64)" ; + default : f = "(x) / (y)" ; + } + break ; + + case GB_RDIV_opcode : // z = y / x ; + + switch (zcode) + { + case GB_BOOL_code : f = "(x)" ; + case GB_INT8_code : f = "GB_IDIV_SIGNED (y,x,8)" ; + case GB_INT16_code : f = "GB_IDIV_SIGNED (y,x,16)" ; + case GB_INT32_code : f = "GB_IDIV_SIGNED (y,x,32)" ; + case GB_INT64_code : f = 
"GB_IDIV_SIGNED (y,x,64)" ; + case GB_UINT8_code : f = "GB_IDIV_UNSIGNED (y,x,8)" ; + case GB_UINT16_code : f = "GB_IDIV_UNSIGNED (y,x,16)" ; + case GB_UINT32_code : f = "GB_IDIV_UNSIGNED (y,x,32)" ; + case GB_UINT64_code : f = "GB_IDIV_UNSIGNED (y,x,64)" ; + default : f = "(y) / (x)" ; + } + break ; + + case GB_EQ_opcode : + case GB_ISEQ_opcode : // 17: z = (x == y) + + f = "(x) == (y)" ; + break ; + + case GB_NE_opcode : + case GB_ISNE_opcode : // 18: z = (x != y) + + f = "(x) != (y)" ; + break ; + + case GB_GT_opcode : + case GB_ISGT_opcode : // 19: z = (x > y) + + f = "(x) > (y)" ; + break ; + + case GB_LT_opcode : + case GB_ISLT_opcode : // 20: z = (x < y) + + f = "(x) < (y)" ; + break ; + + case GB_GE_opcode : + case GB_ISGE_opcode : // 21: z = (x >= y) + + f = "(x) >= (y)" ; + break ; + + case GB_LE_opcode : + case GB_ISLE_opcode : // 22: z = (x <= y) + + f = "(x) <= (y)" ; + break ; + + case GB_LOR_opcode : // 23: z = (x != 0) || (y != 0) + + switch (zcode) + { + case GB_BOOL_code : f = "(x) || (y)" ; + default : f = "((x) != 0) || ((y) != 0)" ; + } + break ; + + case GB_LAND_opcode : // 23: z = (x != 0) && (y != 0) + + switch (zcode) + { + case GB_BOOL_code : f = "(x) && (y)" ; + default : f = "((x) != 0) && ((y) != 0)" ; + } + break ; + + case GB_LXOR_opcode : // 25: z = (x != 0) != (y != 0) + + switch (zcode) + { + case GB_BOOL_code : f = "(x) != (y)" ; + default : f = "((x) != 0) != ((y) != 0)" ; + } + break ; + + // case GB_BOR_opcode : ... bitwise ops + // x | y, etc + + // case GB_PAIR_opcode : + default : + + f = "1" ; + break ; + } + + snprintf (code_string, GB_CUDA_STRLEN, + "#define %s(x,y) (%s)", macro_name, f) ; + } + + + void stringify_terminal + ( + // outputs: + bool *is_monoid_terminal, + char *terminal_condition, + char *terminal_statement, + // inputs: + const char *macro_condition_name, + const char *macro_statement_name, + GB_Opcode opcode, // must be a built-in binary operator from a monoid + GB_Type_code zcode // op->ztype->code + ) + { + //------------------------------------------------------------------------------ + // GB_cuda_stringify_terminal: string to check terminal condition + //------------------------------------------------------------------------------ + + // The macro_condition_name(cij) should return true if the value of cij has + // reached its terminal value, or false otherwise. If the monoid is not + // terminal, then the macro should always return false. The ANY monoid + // should always return true. + + // The macro_statement_name is a macro containing a full statement. If the + // monoid is never terminal, it becomes the empty statement (";"). Otherwise, + // it checks the terminal condition and does a "break" if true. 
+ + + //-------------------------------------------------------------------------- + // determine if the monoid is terminal, and find its terminal value + //-------------------------------------------------------------------------- + + bool is_terminal = false ; + const char *f = NULL ; + + switch (opcode) + { + + #if 0 + case GB_ANY_opcode : + f = NULL ; + is_terminal = true ; + break ; + #endif + + case GB_MIN_opcode : + + is_terminal = true ; + switch (zcode) + { + case GB_BOOL_code : f = "false" ; break ; + case GB_INT8_code : f = "INT8_MIN" ; break ; + case GB_INT16_code : f = "INT16_MIN" ; break ; + case GB_INT32_code : f = "INT32_MIN" ; break ; + case GB_INT64_code : f = "INT64_MIN" ; break ; + case GB_UINT8_code : f = "0" ; break ; + case GB_UINT16_code : f = "0" ; break ; + case GB_UINT32_code : f = "0" ; break ; + case GB_UINT64_code : f = "0" ; break ; + default : f = "(-INFINITY)" ; break ; + } + break ; + + case GB_MAX_opcode : + + is_terminal = true ; + switch (zcode) + { + case GB_BOOL_code : f = "true" ; break ; + case GB_INT8_code : f = "INT8_MAX" ; break ; + case GB_INT16_code : f = "INT16_MAX" ; break ; + case GB_INT32_code : f = "INT32_MAX" ; break ; + case GB_INT64_code : f = "INT64_MAX" ; break ; + case GB_UINT8_code : f = "UINT8_MAX" ; break ; + case GB_UINT16_code : f = "UINT16_MAX" ; break ; + case GB_UINT32_code : f = "UINT32_MAX" ; break ; + case GB_UINT64_code : f = "UINT64_MAX" ; break ; + default : f = "INFINITY" ; break ; + } + break ; + + case GB_PLUS_opcode : + + if (zcode == GB_BOOL_code) + { + f = "true" ; // boolean OR + is_terminal = true ; + } + else + { + f = NULL ; + is_terminal = false ; + } + break ; + + case GB_TIMES_opcode : + + switch (zcode) + { + case GB_BOOL_code : // boolean AND + case GB_INT8_code : + case GB_INT16_code : + case GB_INT32_code : + case GB_INT64_code : + case GB_UINT8_code : + case GB_UINT16_code : + case GB_UINT32_code : + case GB_UINT64_code : + f = "0" ; + is_terminal = true ; + break ; + default : + f = NULL ; + is_terminal = false ; + break ; + } + break ; + + case GB_LOR_opcode : f = "true" ; is_terminal = true ; break ; + case GB_LAND_opcode : f = "false" ; is_terminal = true ; break ; + + case GB_LXOR_opcode : + // case GB_LXNOR_opcode : + case GB_EQ_opcode : + default : + // the monoid is not terminal + f = NULL ; + is_terminal = false ; + break ; + } + + //-------------------------------------------------------------------------- + // construct the macro to test the terminal condition + //-------------------------------------------------------------------------- + + if (is_terminal) + { + // the monoid is terminal + if (f == NULL) + { + // ANY monoid + snprintf (terminal_condition, GB_CUDA_STRLEN, + "#define %s(cij) true", macro_condition_name) ; + snprintf (terminal_statement, GB_CUDA_STRLEN, + "#define %s break", macro_statement_name) ; + } + else + { + // typical terminal monoids: check if C(i,j) has reached its + // terminal value + snprintf (terminal_condition, GB_CUDA_STRLEN, + "#define %s(cij) ((cij) == %s)", macro_condition_name, f) ; + snprintf (terminal_statement, GB_CUDA_STRLEN, + "#define %s if (%s (cij)) break", + macro_statement_name, macro_condition_name) ; + } + } + else + { + // the monoid is not terminal: the condition is always false + snprintf (terminal_condition, GB_CUDA_STRLEN, "#define %s(cij) false", + macro_condition_name) ; + snprintf (terminal_statement, GB_CUDA_STRLEN, "#define %s", + macro_statement_name) ; + } + + (*is_monoid_terminal) = is_terminal ; + } + + + 
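For example, with the macro names that stringify_semiring passes in below ("GB_TERMINAL_CONDITION" and "GB_IF_TERMINAL_BREAK"), the MAX monoid over int8_t is terminal at INT8_MAX and the two snprintf calls above produce

    #define GB_TERMINAL_CONDITION(cij) ((cij) == INT8_MAX)
    #define GB_IF_TERMINAL_BREAK if (GB_TERMINAL_CONDITION (cij)) break

while a non-terminal monoid such as PLUS over double produces

    #define GB_TERMINAL_CONDITION(cij) false
    #define GB_IF_TERMINAL_BREAK

the intent being that a generated kernel can write GB_IF_TERMINAL_BREAK unconditionally inside its reduction loop and let the preprocessor decide whether an early exit is possible.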
//--------------------------------------------------------------------------
+ // Handle mask type and structural vs not
+ //--------------------------------------------------------------------------
+ const char *stringify_mask
+ (
+     const GB_Type_code M_type_code,
+     bool mask_is_structural
+ )
+ {
+
+     if (mask_is_structural)
+     {
+         return (
+             "#define GB_MTYPE void\n"
+             "#define MX(i) true") ;
+     }
+     else
+     {
+         switch (M_type_code)
+         {
+             case GB_BOOL_code:
+             case GB_INT8_code:
+             case GB_UINT8_code:
+                 return (
+                     "#define GB_MTYPE uint8_t\n"
+                     "#define MX(i) Mx [i]") ;
+
+             case GB_INT16_code:
+             case GB_UINT16_code:
+                 return (
+                     "#define GB_MTYPE uint16_t\n"
+                     "#define MX(i) Mx [i]") ;
+
+             case GB_INT32_code:
+             case GB_UINT32_code:
+             // case GB_FC32_code:
+             case GB_FP32_code:
+                 return (
+                     "#define GB_MTYPE uint32_t\n"
+                     "#define MX(i) Mx [i]") ;
+
+             case GB_INT64_code:
+             case GB_UINT64_code:
+             // case GB_FC64_code:
+             case GB_FP64_code:
+                 return (
+                     "#define GB_MTYPE uint64_t\n"
+                     "#define MX(i) Mx [i]") ;
+
+             // case GB_FC64_code:
+             //     return (
+             //         "#define GB_MTYPE double complex\n"
+             //         "#define MX(i) Mx [i]") ;
+
+             default: ;
+         }
+     }
+
+     // unrecognized type
+     return (NULL) ;
+ }
+
+// Construct a macro to load and typecast.  For example:
+//
+//  #define GB_GETA(blob) blob
+//
+// then use as:
+//      GB_GETA (double aij = Ax [p]) ;
+//      GB_GETA (double *Ax = A->x) ;
+//      GB_GETA (T_A *restrict Ax = A->x) ;
+//
+// which become
+//      double aij = Ax [p] ;
+//      double *Ax = A->x ;
+//      T_A *Ax = A->x ;
+//
+// or, if is_pattern is true, the macro becomes the empty string.
+
+ void stringify_load
+ (
+     // output:
+     char *result,
+     // input:
+     const char *macro_name,     // name of macro to construct
+     bool is_pattern             // if true, load/cast does nothing
+ )
+ {
+
+     if (is_pattern)
+     {
+         snprintf (result, GB_CUDA_STRLEN, "#define %s(blob)", macro_name) ;
+     }
+     else
+     {
+         snprintf (result, GB_CUDA_STRLEN, "#define %s(blob) blob", macro_name) ;
+     }
+ }
+
+ // Construct a string defining a semiring.
+ // User-defined types are not handled.
+ // build a semiring (name and code)
+ void stringify_semiring
+ (
+     // input:
+     GrB_Semiring semiring,      // the semiring to stringify
+     bool flipxy,                // multiplier is: mult(a,b) or mult(b,a)
+     GrB_Type ctype,             // the type of C
+     GrB_Type atype,             // the type of A
+     GrB_Type btype,             // the type of B
+     GrB_Type mtype,             // the type of M, or NULL if no mask
+     bool Mask_struct,           // mask is structural
+     bool mask_in_semiring_name, // if true, then the semiring_name includes
+                                 // the mask_name.
If false, then semiring_name + // is independent of the mask_name + // output: (all of size at least GB_CUDA_LEN+1) + char *semiring_name, // name of the semiring + char *semiring_code, // List of types and macro defs + char *mask_name // definition of mask data load + ) + { + + // check inputs + ASSERT (semiring->object_kind == GB_BUILTIN) ; + + // get the semiring + GrB_Monoid add = semiring->add ; + GrB_BinaryOp mult = semiring->multiply ; + GrB_BinaryOp addop = add->op ; + GrB_Type xtype = mult->xtype ; + GrB_Type ytype = mult->ytype ; + GrB_Type ztype = mult->ztype ; + GB_Opcode mult_opcode = mult->opcode ; + GB_Opcode add_opcode = addop->opcode ; + GB_Type_code xcode = xtype->code ; + GB_Type_code ycode = ytype->code ; + GB_Type_code zcode = ztype->code ; + + // these must always be true for any semiring: + ASSERT (mult->ztype == addop->ztype) ; + ASSERT (addop->xtype == addop->ztype && addop->ytype == addop->ztype) ; + + // for now, this is true for all built-in binops: + ASSERT (xcode == ycode) ; + + //-------------------------------------------------------------------------- + // rename redundant boolean operators + //-------------------------------------------------------------------------- + + // consider z = op(x,y) where both x and y are boolean: + // DIV becomes FIRST + // RDIV becomes SECOND + // MIN and TIMES become LAND + // MAX and PLUS become LOR + // NE, ISNE, RMINUS, and MINUS become LXOR + // ISEQ becomes EQ + // ISGT becomes GT + // ISLT becomes LT + // ISGE becomes GE + // ISLE becomes LE + + if (zcode == GB_BOOL_code) + { + // rename the monoid + add_opcode = GB_boolean_rename (add_opcode) ; + } + + if (xcode == GB_BOOL_code) // && (ycode == GB_BOOL_code) + { + // rename the multiplicative operator + mult_opcode = GB_boolean_rename (mult_opcode) ; + } + + //-------------------------------------------------------------------------- + // handle the flip + //-------------------------------------------------------------------------- + + if (flipxy) + { + // z = fmult (b,a) will be computed: handle this by renaming the + // multiplicative operator + + // handle the flip + mult_opcode = GB_binop_flip (mult_opcode) ; + + // the flip is now handled completely. This assumes xtype and ytype + // are the same for all built-in operators. If this changes, the + // types will have to be flipped too. 
+ flipxy = false ; + } + + //-------------------------------------------------------------------------- + // determine if A and/or B are value-agnostic + //-------------------------------------------------------------------------- + + bool op_is_first = (mult_opcode == GB_FIRST_opcode ) ; + bool op_is_second = (mult_opcode == GB_SECOND_opcode) ; + bool op_is_pair = false ; // (mult_opcode == GB_PAIR_opcode) ; + bool A_is_pattern = op_is_second || op_is_pair ; + bool B_is_pattern = op_is_first || op_is_pair ; + + //-------------------------------------------------------------------------- + // construct macros to load scalars from A and B (and typecast) them + //-------------------------------------------------------------------------- + + char acast [GB_CUDA_STRLEN+1] ; + char bcast [GB_CUDA_STRLEN+1] ; + GB_cuda_stringify_load (acast, "GB_GETA", A_is_pattern) ; + GB_cuda_stringify_load (bcast, "GB_GETB", B_is_pattern) ; + + //-------------------------------------------------------------------------- + // construct macros for the multiply + //-------------------------------------------------------------------------- + + char mult_function [GB_CUDA_STRLEN+1] ; + GB_cuda_stringify_binop (mult_function, "GB_MULT", mult_opcode, zcode) ; + + //-------------------------------------------------------------------------- + // construct the monoid macros + //-------------------------------------------------------------------------- + + char add_function [GB_CUDA_STRLEN+1] ; + GB_cuda_stringify_binop (add_function, "GB_ADD", add_opcode, zcode) ; + + char identity_definition [GB_CUDA_STRLEN+1] ; + GB_cuda_stringify_identity ( identity_definition, add_opcode, zcode) ; + + bool is_terminal ; + char terminal_condition [GB_CUDA_STRLEN+1] ; + char terminal_statement [GB_CUDA_STRLEN+1] ; + + GB_cuda_stringify_terminal ( + &is_terminal, terminal_condition, terminal_statement, + "GB_TERMINAL_CONDITION", "GB_IF_TERMINAL_BREAK", add_opcode, zcode) ; + + //-------------------------------------------------------------------------- + // macro to typecast the result back into C + //-------------------------------------------------------------------------- + + // for the ANY_PAIR semiring, "c_is_one" will be true, and Cx [0..cnz] will + // be filled with all 1's later. 
+ bool c_is_one = false ; + // TODO: + // (add_opcode == GB_ANY_opcode && mult_opcode == GB_PAIR_opcode) ; + char ccast [GB_CUDA_STRLEN+1] ; + GB_cuda_stringify_load (ccast, "GB_PUTC", c_is_one) ; + + //-------------------------------------------------------------------------- + // construct the macros to access the mask (if any), and its name + //-------------------------------------------------------------------------- + + const char *mask_string = "" ; + const char *mask_type_name = "" ; + const char *struct_str = "struct"; + if (mtype != NULL) + { + mask_string = GB_cuda_stringify_mask (mtype->code, Mask_struct) ; + mask_type_name = mtype->name ; + } + else + { + mask_type_name = struct_str; + } + + snprintf (mask_name, GB_CUDA_STRLEN, "mask_%s", mask_type_name) ; + + //-------------------------------------------------------------------------- + // build the final semiring code + //-------------------------------------------------------------------------- + + snprintf (semiring_code, GB_CUDA_STRLEN, + "%s\n%s\n%s\n%s\n%s\n%s\n%s\n%s\n%s\n", + acast, bcast, mult_function, add_function, identity_definition, + terminal_condition, terminal_statement, ccast, mask_string) ; + + //-------------------------------------------------------------------------- + // build the final semiring name + //-------------------------------------------------------------------------- + + // the semiring_name depends on: + // add_opcode + // mult_opcode + // ztype->name + // xtype->name (currently, always == ytype->name, but will change (TODO)) + // ytype->name + // ctype->name + // mask_type_name (but only if mask_in_semiring_name is true) + // atype->name + // btype->name + + const char *add_name; + const char *mult_name; + + add_name = GB_cuda_stringify_opcode (add_opcode) ; + mult_name = GB_cuda_stringify_opcode (mult_opcode) ; + + // these are not needed: they are template parameters to the CUDA kernel: + // ztype->name, xtype->name, ytype->name, + // ctype->name, atype->name, btype->name + + // ztype->name is required, since the kernel needs it for the identity + // value. xtype->name is not strictly required. However, the GraphBLAS + // naming scheme is add_mult_xtype, so it is included here. The ytype + // and ztype need not be xtype. + + if (mask_in_semiring_name) + { + + // the format of the semiring name is: + // + // semiring_add_mult_xtype_M_mtype_Z_ztype + + snprintf (semiring_name, GB_CUDA_STRLEN, + "semiring_%s_%s_%s_M_%s_Z_%s", + // The first part is akin to GxB_PLUS_TIMES_FP64 (for example), + // but here this example is semiring_plus_times_double instead: + add_name, mult_name, xtype->name, + // these are not in the GrB* or GxB* name, but are needed by CUDA: + // mask_type_name is (say) 'int64' or 'bool'. + // ztype is the name of the monoid type. + mask_type_name, ztype->name) ; + + } + else + { + + // the format of the semiring name is: + // + // semiring_add_mult_xtype_Z_ztype + + snprintf (semiring_name, GB_CUDA_STRLEN, + "semiring_%s_%s_%s_Z_%s", + // The first part is akin to GxB_PLUS_TIMES_FP64 (for example), + // but here this example is semiring_plus_times_double instead: + add_name, mult_name, xtype->name, + // this is not in the GrB* or GxB* name, but is needed by CUDA: + // ztype is the name of the monoid type. 
+ ztype->name) ; + + } + + printf ("semiring_name:\n%s\n", semiring_name) ; + //printf ("semiring_code:\n%s\n", semiring_code) ; + //printf ("mask_name: \n%s\n", mask_name) ; + } + + +}; + diff --git a/GraphBLAS/CUDA/GB_cuda_stringify.h b/GraphBLAS/CUDA/GB_cuda_stringify.h new file mode 100644 index 0000000000..18cc7464f1 --- /dev/null +++ b/GraphBLAS/CUDA/GB_cuda_stringify.h @@ -0,0 +1,95 @@ +// SPDX-License-Identifier: Apache-2.0 +//------------------------------------------------------------------------------ +// GB_cuda_stringify.h: prototype definitions for using C helpers +//------------------------------------------------------------------------------ + +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2019, All Rights Reserved. +// http://suitesparse.com See GraphBLAS/Doc/License.txt for license. + +//------------------------------------------------------------------------------ + +// This file is #include'd only in the GraphBLAS/CUDA/GB_cuda*.cu source files. + +#ifndef GB_CUDA_STRINGIFY_H +#define GB_CUDA_STRINGIFY_H + +// length of strings for building semiring code and names +#define GB_CUDA_STRLEN 2048 + +void GB_cuda_stringify_terminal // return strings to check terminal +( + // outputs: + bool *is_monoid_terminal, + char *terminal_condition, + char *terminal_statement, + // inputs: + const char *macro_condition_name, + const char *macro_statement_name, + GB_Opcode opcode, // must be a built-in binary operator from a monoid + GB_Type_code zcode // op->ztype->code +) ; + +const char *GB_cuda_stringify_mask +( + const GB_Type_code M_type_code, + bool mask_is_structural +) ; + +void GB_cuda_stringify_semiring // build a semiring (name and code) +( + // input: + GrB_Semiring semiring, // the semiring to stringify + bool flipxy, // multiplier is: mult(a,b) or mult(b,a) + GrB_Type ctype, // the type of C + GrB_Type atype, // the type of A + GrB_Type btype, // the type of B + GrB_Type mtype, // the type of M, or NULL if no mask + bool Mask_struct, // mask is structural + bool mask_in_semiring_name, // if true, then the semiring_name includes + // the mask_name. 
If false, then semiring_name + // is independent of the mask_name + // output: (all of size at least GB_CUDA_LEN+1) + char *semiring_name, // name of the semiring + char *semiring_code, // List of types and macro defs + char *mask_name // definition of mask data load +) ; + +void GB_cuda_stringify_binop +( + // output: + char *code_string, // string with the #define macro + // input: + const char *macro_name, // name of macro to construct + GB_Opcode opcode, // opcode of GraphBLAS operator to convert into a macro + GB_Type_code zcode // op->ztype->code of the operator +) ; + +void GB_cuda_stringify_load // return a string to load/typecast macro +( + // output: + char *result, + // input: + const char *macro_name, // name of macro to construct + bool is_pattern // if true, load/cast does nothing +) ; + +void GB_cuda_stringify_identity // return string for identity value +( + // output: + char *code_string, // string with the #define macro + // input: + GB_Opcode opcode, // must be a built-in binary operator from a monoid + GB_Type_code zcode +) ; + +const char *GB_cuda_stringify_opcode +( + GB_Opcode opcode // opcode of GraphBLAS operator +) ; + +GB_Opcode GB_binop_flip // flipped opcode, or same opcode if not flipped +( + GB_Opcode opcode // opcode to flip +) ; + +#endif diff --git a/GraphBLAS/CUDA/GB_cuda_stringify_binop.c b/GraphBLAS/CUDA/GB_cuda_stringify_binop.c new file mode 100644 index 0000000000..c4e42a6379 --- /dev/null +++ b/GraphBLAS/CUDA/GB_cuda_stringify_binop.c @@ -0,0 +1,647 @@ +//SPDX-License-Identifier: Apache-2.0 + +#include "GB.h" +#include "GB_cuda_stringify.h" + +// The binop macro generates an expression, not a full statement (there +// is no semicolon). + +// for example: +// #define GB_MULT(x,y) ((x) * (y)) + +void GB_cuda_stringify_binop +( + // output: + char *code_string, // string with the #define macro + // input: + const char *macro_name, // name of macro to construct + GB_Opcode opcode, // opcode of GraphBLAS operator to convert into a macro + GB_Type_code zcode // op->ztype->code of the operator +) +{ + const char *op_string ; + int ecode ; + GB_cuda_enumify_binop (&ecode, opcode, zcode) ; + GB_cuda_charify_binop (&op_string, ecode, for_semiring) ; + GB_cuda_macrofy_binop (code_string, macro_name, op_string) ; +} + +void GB_cuda_enumify_binop +( + // output: + int *ecode, // enumerated operator, in range 0 to ... 
(-1 on failure) + // input: + GB_Opcode opcode, // opcode of GraphBLAS operator to convert into a macro + GB_Type_code zcode // op->ztype->code of the operator +// bool for_semiring // true for A*B, false for A+B or A.*B (not needed) +) +{ + int e = -1 ; + + switch (opcode) + { + + case GB_FIRST_opcode : // z = x + + e = 0 ; // "(x)" ; + break ; + + case GB_ANY_opcode : + case GB_SECOND_opcode : // z = y + + e = 1 ; // "(y)" ; + break ; + + case GB_MIN_opcode : // z = min(x,y) + + switch (zcode) + { + case GB_FP32_code : e = 2 ; break ; // "fminf (x,y)" ; + case GB_FP64_code : e = 3 ; break ; // "fmin (x,y)" ; + default : e = 4 ; break ; // "GB_IMIN (x,y)" ; + } + break ; + + case GB_MAX_opcode : // z = max(x,y) + + switch (zcode) + { + case GB_FP32_code : e = 5 ; break ; // "fmaxf (x,y)" ; + case GB_FP64_code : e = 6 ; break ; // "fmax (x,y)" ; + default : e = 7 ; break ; // "GB_IMAX (x,y)" ; + } + break ; + + case GB_PLUS_opcode : // z = x + y + + e = 8 ; break ; // "(x) + (y)" ; + + case GB_MINUS_opcode : // z = x - y + + e = 9 ; break ; // "(x) - (y)" ; + + case GB_RMINUS_opcode : // z = y - x + + e = 10 ; break ; // "(y) - (x)" ; + + case GB_TIMES_opcode : // z = x * y + + e = 11 ; break ; // "(x) * (y)" ; + + case GB_DIV_opcode : // z = x / y ; + + switch (zcode) + { + case GB_INT8_code : e = 12 ; break ; // "GB_IDIV_SIGNED(x,y,8)" + case GB_INT16_code : e = 13 ; break ; // "GB_IDIV_SIGNED(x,y,16)" + case GB_INT32_code : e = 14 ; break ; // "GB_IDIV_SIGNED(x,y,32)" + case GB_INT64_code : e = 15 ; break ; // "GB_IDIV_SIGNED(x,y,64)" + case GB_UINT8_code : e = 16 ; break ; // "GB_IDIV_UNSIGNED(x,y,8)" + case GB_UINT16_code : e = 17 ; break ; // "GB_IDIV_UNSIGNED(x,y,16)" + case GB_UINT32_code : e = 18 ; break ; // "GB_IDIV_UNSIGNED(x,y,32)" + case GB_UINT64_code : e = 19 ; break ; // "GB_IDIV_UNSIGNED(x,y,64)" + default : e = 20 ; break ; // "(x) / (y)" + } + break ; + + case GB_RDIV_opcode : // z = y / x ; + + switch (zcode) + { + case GB_INT8_code : e = 21 ; break ; // GB_IDIV_SIGNED(y,x,8) + case GB_INT16_code : e = 22 ; break ; // GB_IDIV_SIGNED(y,x,16)" + case GB_INT32_code : e = 23 ; break ; // GB_IDIV_SIGNED(y,x,32)" + case GB_INT64_code : e = 24 ; break ; // GB_IDIV_SIGNED(y,x,64)" + case GB_UINT8_code : e = 25 ; break ; // GB_IDIV_UNSIGNED(y,x,8)" + case GB_UINT16_code : e = 26 ; break ; // GB_IDIV_UNSIGNED(y,x,16)" + case GB_UINT32_code : e = 27 ; break ; // GB_IDIV_UNSIGNED(y,x,32)" + case GB_UINT64_code : e = 28 ; break ; // GB_IDIV_UNSIGNED(y,x,64)" + default : e = 29 ; break ; // (y) / (x) + } + break ; + + case GB_EQ_opcode : + case GB_ISEQ_opcode : // z = (x == y) + + e = 30 ; // "(x) == (y)" ; + break ; + + case GB_NE_opcode : + case GB_ISNE_opcode : // z = (x != y) + + e = 31 ; // "(x) != (y)" ; + break ; + + case GB_GT_opcode : + case GB_ISGT_opcode : // z = (x > y) + + e = 32 ; // "(x) > (y)" ; + break ; + + case GB_LT_opcode : + case GB_ISLT_opcode : // z = (x < y) + + e = 33 ; // "(x) < (y)" ; + break ; + + case GB_GE_opcode : + case GB_ISGE_opcode : // z = (x >= y) + + e = 34 ; // "(x) >= (y)" ; + break ; + + case GB_LE_opcode : + case GB_ISLE_opcode : // z = (x <= y) + + e = 35 ; // "(x) <= (y)" ; + break ; + + case GB_LOR_opcode : // z = (x != 0) || (y != 0) + + switch (zcode) + { + case GB_BOOL_code : e = 36 ; break ; // "(x) || (y)" + default : e = 37 ; break ; // "((x)!=0) || ((y)!=0)" + } + break ; + + case GB_LAND_opcode : // z = (x != 0) && (y != 0) + + switch (zcode) + { + case GB_BOOL_code : e = 38 ; break ; // "(x) && (y)" + default : e = 39 ; break ; 
// "((x)!=0) && ((y)!=0)" + } + break ; + + case GB_LXOR_opcode : // z = (x != 0) != (y != 0) + + switch (zcode) + { + case GB_BOOL_code : e = 40 ; break ; // "(x) != (y)" + default : e = 41 ; break ; // "((x)!=0) != ((y)!=0)" + } + break ; + + case GB_BOR_opcode : // z = (x | y), bitwise or + + if (zcode >= GB_INT8_code && zcode <= GB_UINT64_code) + { + e = 42 ; // "(x) | (y)" + } + break ; + + case GB_BAND_opcode : // z = (x & y), bitwise and + + if (zcode >= GB_INT8_code && zcode <= GB_UINT64_code) + { + e = 43 ; // "(x) & (y)" + } + break ; + + case GB_BXOR_opcode : // z = (x ^ y), bitwise xor + + if (zcode >= GB_INT8_code && zcode <= GB_UINT64_code) + { + e = 44 ; // "(x) ^ (y)" + } + break ; + + case GB_BXNOR_opcode : // z = ~(x ^ y), bitwise xnor + + if (zcode >= GB_INT8_code && zcode <= GB_UINT64_code) + { + e = 45 ; // "~((x) ^ (y))" + } + break ; + + case GB_BGET_opcode : // z = bitget (x,y) + + switch (zcode) + { + case GB_INT8_code : e = 46 ; break ; // GB_BITGET(x,y,int8_t, 8) + case GB_INT16_code : e = 47 ; break ; // GB_BITGET(x,y,int16_t,16) + case GB_INT32_code : e = 48 ; break ; // GB_BITGET(x,y,int32_t,32) + case GB_INT64_code : e = 49 ; break ; // GB_BITGET(x,y,int64_t,64) + case GB_UINT8_code : e = 50 ; break ; // GB_BITGET(x,y,uint8_t,8) + case GB_UINT16_code : e = 51 ; break ; // GB_BITGET(x,y,uint16_t,16) + case GB_UINT32_code : e = 52 ; break ; // GB_BITGET(x,y,uint32_t,32) + case GB_UINT64_code : e = 53 ; break ; // GB_BITGET(x,y,uint64_t,64) + default : e = -1 ; break ; + } + break ; + + case GB_BSET_opcode : // z = bitset (x,y) + + switch (zcode) + { + case GB_INT8_code : e = 54 ; break ; // GB_BITSET(x,y,int8_t, 8) + case GB_INT16_code : e = 55 ; break ; // GB_BITSET(x,y,int16_t,16) + case GB_INT32_code : e = 56 ; break ; // GB_BITSET(x,y,int32_t,32) + case GB_INT64_code : e = 57 ; break ; // GB_BITSET(x,y,int64_t,64) + case GB_UINT8_code : e = 58 ; break ; // GB_BITSET(x,y,uint8_t,8) + case GB_UINT16_code : e = 59 ; break ; // GB_BITSET(x,y,uint16_t,16) + case GB_UINT32_code : e = 60 ; break ; // GB_BITSET(x,y,uint32_t,32) + case GB_UINT64_code : e = 61 ; break ; // GB_BITSET(x,y,uint64_t,64) + default : e = -1 ; break ; + } + break ; + + case GB_BCLR_opcode : // z = bitclr (x,y) + + switch (zcode) + { + case GB_INT8_code : e = 62 ; break ; // GB_BITCLR(x,y,int8_t, 8) + case GB_INT16_code : e = 63 ; break ; // GB_BITCLR(x,y,int16_t,16) + case GB_INT32_code : e = 64 ; break ; // GB_BITCLR(x,y,int32_t,32) + case GB_INT64_code : e = 65 ; break ; // GB_BITCLR(x,y,int64_t,64) + case GB_UINT8_code : e = 66 ; break ; // GB_BITCLR(x,y,uint8_t,8) + case GB_UINT16_code : e = 67 ; break ; // GB_BITCLR(x,y,uint16_t,16) + case GB_UINT32_code : e = 68 ; break ; // GB_BITCLR(x,y,uint32_t,32) + case GB_UINT64_code : e = 69 ; break ; // GB_BITCLR(x,y,uint64_t,64) + default : e = -1 ; break ; + } + break ; + + case GB_BSHIFT_opcode : // z = bitshift (x,y) + + switch (zcode) + { + case GB_INT8_code : e = 70 ; break ; // GB_bitshift_int8(x,y) + case GB_INT16_code : e = 71 ; break ; // GB_bitshift_int16(x,y) + case GB_INT32_code : e = 72 ; break ; // GB_bitshift_int32(x,y) + case GB_INT64_code : e = 73 ; break ; // GB_bitshift_int64(x,y) + case GB_UINT8_code : e = 74 ; break ; // GB_bitshift_uint8(x,y) + case GB_UINT16_code : e = 75 ; break ; // GB_bitshift_uint16(x,y) + case GB_UINT32_code : e = 76 ; break ; // GB_bitshift_uint32(x,y) + case GB_UINT64_code : e = 77 ; break ; // GB_bitshift_uint64(x,y) + default : e = -1 ; break ; + } + break ; + + case GB_POW_opcode : // z = pow 
(x,y) + + switch (zcode) + { + case GB_INT8_code : e = 78 ; break ; // GB_pow_int8 (x, y) + case GB_INT16_code : e = 79 ; break ; // GB_pow_int16 (x, y) + case GB_INT32_code : e = 80 ; break ; // GB_pow_int32 (x, y) + case GB_INT64_code : e = 81 ; break ; // GB_pow_int64 (x, y) + case GB_UINT8_code : e = 82 ; break ; // GB_pow_uint8 (x, y) + case GB_UINT16_code : e = 83 ; break ; // GB_pow_uint16 (x, y) + case GB_UINT32_code : e = 84 ; break ; // GB_pow_uint32 (x, y) + case GB_UINT64_code : e = 85 ; break ; // GB_pow_uint64 (x, y) + case GB_FP32_code : e = 86 ; break ; // GB_powf (x, y) + case GB_FP64_code : e = 87 ; break ; // GB_pow (x, y) + case GB_FC32_code : e = 88 ; break ; // GB_cpowf (x, y) + case GB_FC64_code : e = 89 ; break ; // GB_cpow (x, y) + default : e = -1 ; break ; + } + break ; + + case GB_ATAN2_opcode : // z = atan2 (x,y) + + switch (zcode) + { + case GB_FP32_code : e = 90 ; break ; // atan2f (x, y) + case GB_FP64_code : e = 91 ; break ; // atan2 (x, y) + default : e = -1 ; break ; + } + break ; + + case GB_HYPOT_opcode : // z = hypot (x,y) + + switch (zcode) + { + case GB_FP32_code : e = 92 ; break ; // hypotf (x, y) + case GB_FP64_code : e = 93 ; break ; // hypot (x, y) + default : e = -1 ; break ; + } + break ; + + case GB_FMOD_opcode : // z = fmod (x,y) + + switch (zcode) + { + case GB_FP32_code : e = 94 ; break ; // fmodf (x, y) + case GB_FP64_code : e = 95 ; break ; // fmod (x, y) + default : e = -1 ; break ; + } + break ; + + case GB_REMAINDER_opcode : // z = remainder (x,y) + + switch (zcode) + { + case GB_FP32_code : e = 96 ; break ; // remainderf (x, y) + case GB_FP64_code : e = 97 ; break ; // remainder (x, y) + default : e = -1 ; break ; + } + break ; + + case GB_COPYSIGN_opcode : // z = copysign (x,y) + + switch (zcode) + { + case GB_FP32_code : e = 98 ; break ; // copysignf (x, y) + case GB_FP64_code : e = 99 ; break ; // copysign (x, y) + default : e = -1 ; break ; + } + break ; + + case GB_LDEXP_opcode : // z = ldexp (x,y) + + switch (zcode) + { + case GB_FP32_code : e = 100 ; break ; // ldexpf (x, y) + case GB_FP64_code : e = 101 ; break ; // ldexp (x, y) + default : e = -1 ; break ; + } + break ; + + case GB_CMPLX_opcode : // z = cmplx (x,y) + + switch (zcode) + { + case GB_FP32_code : e = 102 ; break ; // GxB_CMPLXF (x, y) + case GB_FP64_code : e = 103 ; break ; // GxB_CMPLX (x, y) + default : e = -1 ; break ; + } + break ; + + case GB_PAIR_opcode : // z = 1 + + e = 104 ; break ; // 1 + + case GB_FIRSTI_opcode : // z = first_i(A(i,j),y) == i + + e = 105 ; break ; // z = i + + case GB_FIRSTI1_opcode : // z = first_i1(A(i,j),y) == i+1 + + e = 106 ; break ; // z = i+1 + + case GB_FIRSTJ_opcode : // z = first_j(A(i,j),y) == j + + e = 107 ; break ; // z = for_semiring ? (k) : (j) + + case GB_FIRSTJ1_opcode : // z = first_j1(A(i,j),y) == j+1 + + e = 108 ; break ; // z = for_semiring ? (k+1) : (j+1) + + case GB_SECONDI_opcode : // z = second_i(x,B(i,j)) == i + + e = 109 ; break ; // z = for_semiring ? (k) : (i) + + case GB_SECONDI1_opcode : // z = second_i1(x,B(i,j)) == i+1 + + e = 110 ; break ; // z = for_semiring ? 
(k) : () + + case GB_SECONDJ_opcode : // z = second_j(x,B(i,j)) == j + + e = 111 ; break ; // z = j + + case GB_SECONDJ1_opcode : // z = second_j1(x,B(i,j)) == j+1 + + e = 112 ; break ; // z = j+1 + + default : break ; + } + + (*ecode) = e ; +} + +void GB_cuda_charify_binop +( + // output: + char **op_string, // string defining the operator + // input: + int ecode, // from GB_cuda_enumify_binop + bool for_semiring // true for A*B, false for A+B or A.*B (not needed) +) +{ + const char *f ; + + switch (ecode) + { + + // first + case 0 : f = "(x)" ; break ; + + // any, second + case 1 : f = "(y)" ; break ; + + // min + case 2 : f = "fminf (x,y)" ; break ; + case 3 : f = "fmin (x,y)" ; break ; + case 4 : f = "GB_IMIN (x,y)" ; break ; + + // max + case 5 : f = "fmaxf (x,y)" ; break ; + case 6 : f = "fmax (x,y)" ; break ; + case 7 : f = "GB_IMAX (x,y)" ; break ; + + // plus + case 8 : f = "(x) + (y)" ; break ; + + // minus + case 9 : f = "(x) - (y)" ; break ; + + // rminus + case 10 : f = "(y) - (x)" ; break ; + + // times + case 11 : f = "(x) * (y)" ; break ; + + // div + case 12 : f = "GB_IDIV_SIGNED(x,y,8)" ; break ; + case 13 : f = "GB_IDIV_SIGNED(x,y,16)" ; break ; + case 14 : f = "GB_IDIV_SIGNED(x,y,32)" ; break ; + case 15 : f = "GB_IDIV_SIGNED(x,y,64)" ; break ; + case 16 : f = "GB_IDIV_UNSIGNED(x,y,8)" ; break ; + case 17 : f = "GB_IDIV_UNSIGNED(x,y,16)" ; break ; + case 18 : f = "GB_IDIV_UNSIGNED(x,y,32)" ; break ; + case 19 : f = "GB_IDIV_UNSIGNED(x,y,64)" ; break ; + case 20 : f = "(x) / (y)" ; break ; + + // rdiv + case 21 : f = "GB_IDIV_SIGNED(y,x,8)" ; break ; + case 22 : f = "GB_IDIV_SIGNED(y,x,16)" ; break ; + case 23 : f = "GB_IDIV_SIGNED(y,x,32)" ; break ; + case 24 : f = "GB_IDIV_SIGNED(y,x,64)" ; break ; + case 25 : f = "GB_IDIV_UNSIGNED(y,x,8)" ; break ; + case 26 : f = "GB_IDIV_UNSIGNED(y,x,16)" ; break ; + case 27 : f = "GB_IDIV_UNSIGNED(y,x,32)" ; break ; + case 28 : f = "GB_IDIV_UNSIGNED(y,x,64)" ; break ; + case 29 : f = "(y) / (x)" ; break ; + + // eq, iseq + case 30 : f = "(x) == (y)" ; break ; + + // ne, isne + case 31 : f = "(x) != (y)" ; break ; + + // gt, isgt + case 32 : f = "(x) > (y)" ; break ; + + // lt, islt + case 33 : f = "(x) < (y)" ; break ; + + // ge, isget + case 34 : f = "(x) >= (y)" ; break ; + + // le, isle + case 35 : f = "(x) <= (y)" ; break ; + + // lor + case 36 : f = "(x) || (y)" ; break ; + case 37 : f = "((x)!=0) || ((y)!=0)" ; break ; + + // land + case 38 : f = "(x) && (y)" ; break ; + case 39 : f = "((x)!=0) && ((y)!=0)" ; break ; + + // lxor + case 40 : f = "(x) != (y)" ; break ; + case 41 : f = "((x)!=0) != ((y)!=0)" ; break ; + + // bor + case 42 : f = "(x) | (y)" ; break ; + + // band + case 43 : f = "(x) & (y)" ; break ; + + // bxor + case 44 : f = "(x) ^ (y)" ; break ; + + // bxnor + case 45 : f = "~((x) ^ (y))" ; break ; + + // bget + case 46 : f = "GB_BITGET(x,y,int8_t, 8)" ; break ; + case 47 : f = "GB_BITGET(x,y,int16_t,16)" ; break ; + case 48 : f = "GB_BITGET(x,y,int32_t,32)" ; break ; + case 49 : f = "GB_BITGET(x,y,int64_t,64)" ; break ; + case 50 : f = "GB_BITGET(x,y,uint8_t,8)" ; break ; + case 51 : f = "GB_BITGET(x,y,uint16_t,16)" ; break ; + case 52 : f = "GB_BITGET(x,y,uint32_t,32)" ; break ; + case 53 : f = "GB_BITGET(x,y,uint64_t,64)" ; break ; + + // bset + case 54 : f = "GB_BITSET(x,y,int8_t, 8)" ; break ; + case 55 : f = "GB_BITSET(x,y,int16_t,16)" ; break ; + case 56 : f = "GB_BITSET(x,y,int32_t,32)" ; break ; + case 57 : f = "GB_BITSET(x,y,int64_t,64)" ; break ; + case 58 : f = "GB_BITSET(x,y,uint8_t,8)" ; break ; 
+        case 59 : f = "GB_BITSET(x,y,uint16_t,16)" ; break ;
+        case 60 : f = "GB_BITSET(x,y,uint32_t,32)" ; break ;
+        case 61 : f = "GB_BITSET(x,y,uint64_t,64)" ; break ;
+
+        // bclr
+        case 62 : f = "GB_BITCLR(x,y,int8_t, 8)" ; break ;
+        case 63 : f = "GB_BITCLR(x,y,int16_t,16)" ; break ;
+        case 64 : f = "GB_BITCLR(x,y,int32_t,32)" ; break ;
+        case 65 : f = "GB_BITCLR(x,y,int64_t,64)" ; break ;
+        case 66 : f = "GB_BITCLR(x,y,uint8_t,8)" ; break ;
+        case 67 : f = "GB_BITCLR(x,y,uint16_t,16)" ; break ;
+        case 68 : f = "GB_BITCLR(x,y,uint32_t,32)" ; break ;
+        case 69 : f = "GB_BITCLR(x,y,uint64_t,64)" ; break ;
+
+        // bshift
+        case 70 : f = "GB_bitshift_int8(x,y)" ; break ;
+        case 71 : f = "GB_bitshift_int16(x,y)" ; break ;
+        case 72 : f = "GB_bitshift_int32(x,y)" ; break ;
+        case 73 : f = "GB_bitshift_int64(x,y)" ; break ;
+        case 74 : f = "GB_bitshift_uint8(x,y)" ; break ;
+        case 75 : f = "GB_bitshift_uint16(x,y)" ; break ;
+        case 76 : f = "GB_bitshift_uint32(x,y)" ; break ;
+        case 77 : f = "GB_bitshift_uint64(x,y)" ; break ;
+
+        // pow
+        case 78 : f = "GB_pow_int8 (x, y)" ; break ;
+        case 79 : f = "GB_pow_int16 (x, y)" ; break ;
+        case 80 : f = "GB_pow_int32 (x, y)" ; break ;
+        case 81 : f = "GB_pow_int64 (x, y)" ; break ;
+        case 82 : f = "GB_pow_uint8 (x, y)" ; break ;
+        case 83 : f = "GB_pow_uint16 (x, y)" ; break ;
+        case 84 : f = "GB_pow_uint32 (x, y)" ; break ;
+        case 85 : f = "GB_pow_uint64 (x, y)" ; break ;
+        case 86 : f = "GB_powf (x, y)" ; break ;
+        case 87 : f = "GB_pow (x, y)" ; break ;
+        case 88 : f = "GB_cpowf (x, y)" ; break ;
+        case 89 : f = "GB_cpow (x, y)" ; break ;
+
+        // atan2
+        case 90 : f = "atan2f (x, y)" ; break ;
+        case 91 : f = "atan2 (x, y)" ; break ;
+
+        // hypot
+        case 92 : f = "hypotf (x, y)" ; break ;
+        case 93 : f = "hypot (x, y)" ; break ;
+
+        // fmod
+        case 94 : f = "fmodf (x, y)" ; break ;
+        case 95 : f = "fmod (x, y)" ; break ;
+
+        // remainder
+        case 96 : f = "remainderf (x, y)" ; break ;
+        case 97 : f = "remainder (x, y)" ; break ;
+
+        // copysign
+        case 98 : f = "copysignf (x, y)" ; break ;
+        case 99 : f = "copysign (x, y)" ; break ;
+
+        // ldexp
+        case 100 : f = "ldexpf (x, y)" ; break ;
+        case 101 : f = "ldexp (x, y)" ; break ;
+
+        // cmplx
+        case 102 : f = "GxB_CMPLXF (x, y)" ; break ;
+        case 103 : f = "GxB_CMPLX (x, y)" ; break ;
+
+        // pair
+        case 104 : f = "(1)" ; break ;
+
+        // firsti
+        case 105 : f = "(i)" ; break ;
+
+        // firsti1
+        case 106 : f = "(i+1)" ; break ;
+
+        // firstj
+        case 107 : f = for_semiring ? "(k)" : "(j)" ; break ;
+
+        // firstj1
+        case 108 : f = for_semiring ? "(k+1)" : "(j+1)" ; break ;
+
+        // secondi
+        case 109 : f = for_semiring ? "(k)" : "(i)" ; break ;
+
+        // secondi1
+        case 110 : f = for_semiring ?
"(k+1)" : "(i+1)" ; break ; + + // secondj + case 111 : f = "(j)" ; break ; + + // secondj1 + case 112 : f = "(j+1)" ; break ; + + default : f = NULL ; ; break ; + } + + (*op_string) = f ; +} + +void GB_cuda_macrofy_binop +( + // output: + char *code_string, // string with the #define macro + // input: + const char *macro_name, // name of macro to construct + char *op_string // string defining the operator +) +{ + snprintf (code_string, GB_CUDA_STRLEN, + "#define %s(x,y) (%s)", macro_name, op_string) ; +} + diff --git a/GraphBLAS/CUDA/GB_cuda_stringify_identity.c b/GraphBLAS/CUDA/GB_cuda_stringify_identity.c new file mode 100644 index 0000000000..c969c591fd --- /dev/null +++ b/GraphBLAS/CUDA/GB_cuda_stringify_identity.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: Apache-2.0 +//------------------------------------------------------------------------------ +// GB_cuda_stringify_identity: return string for identity value +//------------------------------------------------------------------------------ + +#include "GB.h" +#include "GB_cuda_stringify.h" + +#define ID( x) IDENT = (x) + +void GB_cuda_stringify_identity // return string for identity value +( + // output: + char *code_string, // string with the #define macro + // input: + GB_Opcode opcode, // must be a built-in binary operator from a monoid + GB_Type_code zcode // type code used in the opcode we want +) +{ + const char *IDENT; + switch (opcode) + { + case GB_MIN_opcode : + + switch (zcode) + { + case GB_BOOL_code : ID ("true") ; // boolean AND + case GB_INT8_code : ID ("INT8_MAX") ; + case GB_INT16_code : ID ("INT16_MAX") ; + case GB_INT32_code : ID ("INT32_MAX") ; + case GB_INT64_code : ID ("INT64_MAX") ; + case GB_UINT8_code : ID ("UINT8_MAX") ; + case GB_UINT16_code : ID ("UINT16_MAX") ; + case GB_UINT32_code : ID ("UINT32_MAX") ; + case GB_UINT64_code : ID ("UINT64_MAX") ; + default : ID ("INFINITY") ; + } + break ; + + case GB_MAX_opcode : + + switch (zcode) + { + case GB_BOOL_code : ID ("false") ; // boolean OR + case GB_INT8_code : ID ("INT8_MIN") ; + case GB_INT16_code : ID ("INT16_MIN") ; + case GB_INT32_code : ID ("INT32_MIN") ; + case GB_INT64_code : ID ("INT64_MIN") ; + case GB_UINT8_code : ID ("0") ; + case GB_UINT16_code : ID ("0") ; + case GB_UINT32_code : ID ("0") ; + case GB_UINT64_code : ID ("0") ; + default : ID ("(-INFINITY)") ; + } + break ; + + case GB_PLUS_opcode : ID ("0") ; + case GB_TIMES_opcode : ID ("1") ; + case GB_LOR_opcode : ID ("false") ; + case GB_LAND_opcode : ID ("true") ; + case GB_LXOR_opcode : ID ("false") ; + // case GB_LXNOR_opcode : + case GB_EQ_opcode : ID ("true") ; + // case GB_ANY_opcode : + default : ID ("0") ; + } + snprintf (code_string, GB_CUDA_STRLEN, "#define GB_IDENTITY (%s)", IDENT) ; + +} + diff --git a/GraphBLAS/CUDA/GB_cuda_stringify_load.c b/GraphBLAS/CUDA/GB_cuda_stringify_load.c new file mode 100644 index 0000000000..cf0117db19 --- /dev/null +++ b/GraphBLAS/CUDA/GB_cuda_stringify_load.c @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: Apache-2.0 + +// Construct a macro to load and typecast. For example: +// +// #define GB_GETA(blob) blob +// +// then use as: +// GB_GETA (double aij = Ax [p]) ; +// GB_GETA (double *Ax = A->x) ; +// GB_GETA (T_A *restrict Ax = A->x) ; +// +// which become +// double aij = Ax [p] ; +// double *Ax = A->x ; +// T_A *Ax = A->x ; +// +// or, if is_pattern is true, the macro becomes the empty string. 
+ +#include "GB.h" +#include "GB_cuda_stringify.h" + +void GB_cuda_stringify_load // return a string to load/typecast macro +( + // output: + char *result, + // input: + const char *macro_name, // name of macro to construct + bool is_pattern // if true, load/cast does nothing +) +{ + + if (is_pattern) + { + snprintf (result, GB_CUDA_STRLEN, "#define %s(blob)", macro_name) ; + } + else + { + snprintf (result, GB_CUDA_STRLEN, "#define %s(blob) blob", macro_name) ; + } +} + diff --git a/GraphBLAS/CUDA/GB_cuda_stringify_mask.c b/GraphBLAS/CUDA/GB_cuda_stringify_mask.c new file mode 100644 index 0000000000..3366328c99 --- /dev/null +++ b/GraphBLAS/CUDA/GB_cuda_stringify_mask.c @@ -0,0 +1,65 @@ +//SPDX-License-Identifier: Apache-2.0 + +//#include "GB_cuda.h" +#include "GB.h" +#include "GB_cuda_stringify.h" + +const char *GB_cuda_stringify_mask +( + const GB_Type_code M_type_code, + bool mask_is_structural +) +{ + + if (mask_is_structural) + { + return ( + "#define GB_MTYPE void\n" + "#define MX(i) true") ; + } + else + { + switch (M_type_code) + { + case GB_BOOL_code: + case GB_INT8_code: + case GB_UINT8_code: + return ( + "#define GB_MTYPE uint8_t\n" + "#define MX(i) Mx [i]") ; + + case GB_INT16_code: + case GB_UINT16_code: + return ( + "#define GB_MTYPE uint16_t\n" + "#define MX(i) Mx [i]") ; + + case GB_INT32_code: + case GB_UINT32_code: +// case GB_FC32_code: + case GB_FP32_code: + return ( + "#define GB_MTYPE uint32_t\n" + "#define MX(i) Mx [i]") ; + + case GB_INT64_code: + case GB_UINT64_code: +// case GB_FC64_code: + case GB_FP64_code: + return ( + "#define GB_MTYPE uint64_t\n" + "#define MX(i) Mx [i]") ; + +// case GB_FC64_code: +// return ( +// "#define GB_MTYPE double complex\n" +// "#define MX(i) Mx [i]") ; + + default: ; + } + } + + // unrecognized type + return (NULL) ; +} + diff --git a/GraphBLAS/CUDA/GB_cuda_stringify_opcode.c b/GraphBLAS/CUDA/GB_cuda_stringify_opcode.c new file mode 100644 index 0000000000..c54b0c4c91 --- /dev/null +++ b/GraphBLAS/CUDA/GB_cuda_stringify_opcode.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: Apache-2.0 + +#include "GB.h" +#include "GB_cuda_stringify.h" + +const char *GB_cuda_stringify_opcode +( + GB_Opcode opcode // opcode of GraphBLAS operator +) +{ + + switch (opcode) + { + case GB_FIRST_opcode : return ("1st") ; + // case GB_ANY_opcode : return ("any") ; + case GB_SECOND_opcode : return ("2nd") ; + case GB_MIN_opcode : return ("min") ; + case GB_MAX_opcode : return ("max") ; + case GB_PLUS_opcode : return ("plus") ; + case GB_MINUS_opcode : return ("minus") ; + case GB_RMINUS_opcode : return ("rminus") ; + case GB_TIMES_opcode : return ("times") ; + case GB_DIV_opcode : return ("div") ; + case GB_RDIV_opcode : return ("rdiv") ; + case GB_EQ_opcode : return ("eq") ; + case GB_ISEQ_opcode : return ("iseq") ; + case GB_NE_opcode : return ("ne") ; + case GB_ISNE_opcode : return ("isne") ; + case GB_GT_opcode : return ("gt") ; + case GB_ISGT_opcode : return ("isgt") ; + case GB_LT_opcode : return ("lt") ; + case GB_ISLT_opcode : return ("islt") ; + case GB_GE_opcode : return ("ge") ; + case GB_ISGE_opcode : return ("isge") ; + case GB_LE_opcode : return ("le") ; + case GB_ISLE_opcode : return ("isle") ; + case GB_LOR_opcode : return ("lor") ; + case GB_LAND_opcode : return ("land") ; + case GB_LXOR_opcode : return ("lxor") ; + // case GB_BOR_opcode : ... 
bitwise ops + // x | y, etc + // case GB_PAIR_opcode : + default : ; + } + + return ("") ; +} + diff --git a/GraphBLAS/CUDA/GB_cuda_stringify_semiring.c b/GraphBLAS/CUDA/GB_cuda_stringify_semiring.c new file mode 100644 index 0000000000..4dfd292f3a --- /dev/null +++ b/GraphBLAS/CUDA/GB_cuda_stringify_semiring.c @@ -0,0 +1,252 @@ +// SPDX-License-Identifier: Apache-2.0 + +// Construct a string defining a semiring. +// User-defined types are not handled. + +#include "GB.h" +#include "GB_cuda_stringify.h" + +void GB_cuda_stringify_semiring // build a semiring (name and code) +( + // input: + GrB_Semiring semiring, // the semiring to stringify + bool flipxy, // multiplier is: mult(a,b) or mult(b,a) + GrB_Type ctype, // the type of C + GrB_Type atype, // the type of A + GrB_Type btype, // the type of B + GrB_Type mtype, // the type of M, or NULL if no mask + bool Mask_struct, // mask is structural + bool mask_in_semiring_name, // if true, then the semiring_name includes + // the mask_name. If false, then semiring_name + // is independent of the mask_name + // output: (all of size at least GB_CUDA_LEN+1) + char *semiring_name, // name of the semiring + char *semiring_code, // List of types and macro defs + char *mask_name // definition of mask data load +) +{ + + // check inputs + ASSERT (semiring->object_kind == GB_BUILTIN) ; + + // get the semiring + GrB_Monoid add = semiring->add ; + GrB_BinaryOp mult = semiring->multiply ; + GrB_BinaryOp addop = add->op ; + GrB_Type xtype = mult->xtype ; + GrB_Type ytype = mult->ytype ; + GrB_Type ztype = mult->ztype ; + GB_Opcode mult_opcode = mult->opcode ; + GB_Opcode add_opcode = addop->opcode ; + GB_Type_code xcode = xtype->code ; + GB_Type_code ycode = ytype->code ; + GB_Type_code zcode = ztype->code ; + + // these must always be true for any semiring: + ASSERT (mult->ztype == addop->ztype) ; + ASSERT (addop->xtype == addop->ztype && addop->ytype == addop->ztype) ; + + // for now, this is true for all built-in binops: + ASSERT (xcode == ycode) ; + + //-------------------------------------------------------------------------- + // rename redundant boolean operators + //-------------------------------------------------------------------------- + + // consider z = op(x,y) where both x and y are boolean: + // DIV becomes FIRST + // RDIV becomes SECOND + // MIN and TIMES become LAND + // MAX and PLUS become LOR + // NE, ISNE, RMINUS, and MINUS become LXOR + // ISEQ becomes EQ + // ISGT becomes GT + // ISLT becomes LT + // ISGE becomes GE + // ISLE becomes LE + + if (zcode == GB_BOOL_code) + { + // rename the monoid + add_opcode = GB_boolean_rename (add_opcode) ; + } + + if (xcode == GB_BOOL_code) // && (ycode == GB_BOOL_code) + { + // rename the multiplicative operator + mult_opcode = GB_boolean_rename (mult_opcode) ; + } + + //-------------------------------------------------------------------------- + // handle the flip + //-------------------------------------------------------------------------- + + if (flipxy) + { + // z = fmult (b,a) will be computed: handle this by renaming the + // multiplicative operator + + // handle the flip + mult_opcode = GB_binop_flip (mult_opcode) ; + + // the flip is now handled completely. This assumes xtype and ytype + // are the same for all built-in operators. If this changes, the + // types will have to be flipped too. 
+ flipxy = false ; + } + + //-------------------------------------------------------------------------- + // determine if A and/or B are value-agnostic + //-------------------------------------------------------------------------- + + bool op_is_first = (mult_opcode == GB_FIRST_opcode ) ; + bool op_is_second = (mult_opcode == GB_SECOND_opcode) ; + bool op_is_pair = false ; // (mult_opcode == GB_PAIR_opcode) ; + bool A_is_pattern = op_is_second || op_is_pair ; + bool B_is_pattern = op_is_first || op_is_pair ; + + //-------------------------------------------------------------------------- + // construct macros to load scalars from A and B (and typecast) them + //-------------------------------------------------------------------------- + + char acast [GB_CUDA_STRLEN+1] ; + char bcast [GB_CUDA_STRLEN+1] ; + GB_cuda_stringify_load (acast, "GB_GETA", A_is_pattern) ; + GB_cuda_stringify_load (bcast, "GB_GETB", B_is_pattern) ; + + //-------------------------------------------------------------------------- + // construct macros for the multiply + //-------------------------------------------------------------------------- + + char mult_function [GB_CUDA_STRLEN+1] ; + GB_cuda_stringify_binop (mult_function, "GB_MULT", mult_opcode, zcode) ; + + //-------------------------------------------------------------------------- + // construct the monoid macros + //-------------------------------------------------------------------------- + + char add_function [GB_CUDA_STRLEN+1] ; + GB_cuda_stringify_binop (add_function, "GB_ADD", add_opcode, zcode) ; + + char identity_definition [GB_CUDA_STRLEN+1] ; + GB_cuda_stringify_identity ( identity_definition, add_opcode, zcode) ; + + bool is_terminal ; + char terminal_condition [GB_CUDA_STRLEN+1] ; + char terminal_statement [GB_CUDA_STRLEN+1] ; + + GB_cuda_stringify_terminal ( + &is_terminal, terminal_condition, terminal_statement, + "GB_TERMINAL_CONDITION", "GB_IF_TERMINAL_BREAK", add_opcode, zcode) ; + + //-------------------------------------------------------------------------- + // macro to typecast the result back into C + //-------------------------------------------------------------------------- + + // for the ANY_PAIR semiring, "c_is_one" will be true, and Cx [0..cnz] will + // be filled with all 1's later. 
+ bool c_is_one = false ; + // TODO: + // (add_opcode == GB_ANY_opcode && mult_opcode == GB_PAIR_opcode) ; + char ccast [GB_CUDA_STRLEN+1] ; + GB_cuda_stringify_load (ccast, "GB_PUTC", c_is_one) ; + + //-------------------------------------------------------------------------- + // construct the macros to access the mask (if any), and its name + //-------------------------------------------------------------------------- + + const char *mask_string = "" ; + const char *mask_type_name = "" ; + const char *struct_str = "struct"; + if (mtype != NULL) + { + mask_string = GB_cuda_stringify_mask (mtype->code, Mask_struct) ; + mask_type_name = mtype->name ; + } + else + { + mask_type_name = struct_str; + } + + snprintf (mask_name, GB_CUDA_STRLEN, "mask_%s", mask_type_name) ; + + //-------------------------------------------------------------------------- + // build the final semiring code + //-------------------------------------------------------------------------- + + snprintf (semiring_code, GB_CUDA_STRLEN, + "%s\n%s\n%s\n%s\n%s\n%s\n%s\n%s\n%s\n", + acast, bcast, mult_function, add_function, identity_definition, + terminal_condition, terminal_statement, ccast, mask_string) ; + + //-------------------------------------------------------------------------- + // build the final semiring name + //-------------------------------------------------------------------------- + + // the semiring_name depends on: + // add_opcode + // mult_opcode + // ztype->name + // xtype->name (currently, always == ytype->name, but will change (TODO)) + // ytype->name + // ctype->name + // mask_type_name (but only if mask_in_semiring_name is true) + // atype->name + // btype->name + + const char *add_name; + const char *mult_name; + + add_name = GB_cuda_stringify_opcode (add_opcode) ; + mult_name = GB_cuda_stringify_opcode (mult_opcode) ; + +// these are not needed: they are template parameters to the CUDA kernel: +// ztype->name, xtype->name, ytype->name, +// ctype->name, atype->name, btype->name + +// ztype->name is required, since the kernel needs it for the identity +// value. xtype->name is not strictly required. However, the GraphBLAS +// naming scheme is add_mult_xtype, so it is included here. The ytype +// and ztype need not be xtype. + + if (mask_in_semiring_name) + { + + // the format of the semiring name is: + // + // semiring_add_mult_xtype_M_mtype_Z_ztype + + snprintf (semiring_name, GB_CUDA_STRLEN, + "semiring_%s_%s_%s_M_%s_Z_%s", + // The first part is akin to GxB_PLUS_TIMES_FP64 (for example), + // but here this example is semiring_plus_times_double instead: + add_name, mult_name, xtype->name, + // these are not in the GrB* or GxB* name, but are needed by CUDA: + // mask_type_name is (say) 'int64' or 'bool'. + // ztype is the name of the monoid type. + mask_type_name, ztype->name) ; + + } + else + { + + // the format of the semiring name is: + // + // semiring_add_mult_xtype_Z_ztype + + snprintf (semiring_name, GB_CUDA_STRLEN, + "semiring_%s_%s_%s_Z_%s", + // The first part is akin to GxB_PLUS_TIMES_FP64 (for example), + // but here this example is semiring_plus_times_double instead: + add_name, mult_name, xtype->name, + // this is not in the GrB* or GxB* name, but is needed by CUDA: + // ztype is the name of the monoid type. 
+ ztype->name) ; + + } + + printf ("semiring_name:\n%s\n", semiring_name) ; + //printf ("semiring_code:\n%s\n", semiring_code) ; + //printf ("mask_name: \n%s\n", mask_name) ; +} + diff --git a/GraphBLAS/CUDA/GB_cuda_stringify_terminal.c b/GraphBLAS/CUDA/GB_cuda_stringify_terminal.c new file mode 100644 index 0000000000..26773d0383 --- /dev/null +++ b/GraphBLAS/CUDA/GB_cuda_stringify_terminal.c @@ -0,0 +1,172 @@ +// SPDX-License-Identifier: Apache-2.0 +//------------------------------------------------------------------------------ +// GB_cuda_stringify_terminal: string to check terminal condition +//------------------------------------------------------------------------------ + +// The macro_condition_name(cij) should return true if the value of cij has +// reached its terminal value, or false otherwise. If the monoid is not +// terminal, then the macro should always return false. The ANY monoid +// should always return true. + +// The macro_statement_name is a macro containing a full statement. If the +// monoid is never terminal, it becomes the empty statement (";"). Otherwise, +// it checks the terminal condition and does a "break" if true. + +#include "GB.h" +#include "GB_cuda_stringify.h" + +void GB_cuda_stringify_terminal // return strings to check terminal +( + // outputs: + bool *is_monoid_terminal, + char *terminal_condition, + char *terminal_statement, + // inputs: + const char *macro_condition_name, + const char *macro_statement_name, + GB_Opcode opcode, // must be a built-in binary operator from a monoid + GB_Type_code zcode // op->ztype->code +) +{ + + //-------------------------------------------------------------------------- + // determine if the monoid is terminal, and find its terminal value + //-------------------------------------------------------------------------- + + bool is_terminal = false ; + const char *f = NULL ; + + switch (opcode) + { + + #if 0 + case GB_ANY_opcode : + f = NULL ; + is_terminal = true ; + break ; + #endif + + case GB_MIN_opcode : + + is_terminal = true ; + switch (zcode) + { + case GB_BOOL_code : f = "false" ; break ; + case GB_INT8_code : f = "INT8_MIN" ; break ; + case GB_INT16_code : f = "INT16_MIN" ; break ; + case GB_INT32_code : f = "INT32_MIN" ; break ; + case GB_INT64_code : f = "INT64_MIN" ; break ; + case GB_UINT8_code : f = "0" ; break ; + case GB_UINT16_code : f = "0" ; break ; + case GB_UINT32_code : f = "0" ; break ; + case GB_UINT64_code : f = "0" ; break ; + default : f = "(-INFINITY)" ; break ; + } + break ; + + case GB_MAX_opcode : + + is_terminal = true ; + switch (zcode) + { + case GB_BOOL_code : f = "true" ; break ; + case GB_INT8_code : f = "INT8_MAX" ; break ; + case GB_INT16_code : f = "INT16_MAX" ; break ; + case GB_INT32_code : f = "INT32_MAX" ; break ; + case GB_INT64_code : f = "INT64_MAX" ; break ; + case GB_UINT8_code : f = "UINT8_MAX" ; break ; + case GB_UINT16_code : f = "UINT16_MAX" ; break ; + case GB_UINT32_code : f = "UINT32_MAX" ; break ; + case GB_UINT64_code : f = "UINT64_MAX" ; break ; + default : f = "INFINITY" ; break ; + } + break ; + + case GB_PLUS_opcode : + + if (zcode == GB_BOOL_code) + { + f = "true" ; // boolean OR + is_terminal = true ; + } + else + { + f = NULL ; + is_terminal = false ; + } + break ; + + case GB_TIMES_opcode : + + switch (zcode) + { + case GB_BOOL_code : // boolean AND + case GB_INT8_code : + case GB_INT16_code : + case GB_INT32_code : + case GB_INT64_code : + case GB_UINT8_code : + case GB_UINT16_code : + case GB_UINT32_code : + case GB_UINT64_code : + f = "0" ; + 
is_terminal = true ; + break ; + default : + f = NULL ; + is_terminal = false ; + break ; + } + break ; + + case GB_LOR_opcode : f = "true" ; is_terminal = true ; break ; + case GB_LAND_opcode : f = "false" ; is_terminal = true ; break ; + + case GB_LXOR_opcode : + // case GB_LXNOR_opcode : + case GB_EQ_opcode : + default : + // the monoid is not terminal + f = NULL ; + is_terminal = false ; + break ; + } + + //-------------------------------------------------------------------------- + // construct the macro to test the terminal condition + //-------------------------------------------------------------------------- + + if (is_terminal) + { + // the monoid is terminal + if (f == NULL) + { + // ANY monoid + snprintf (terminal_condition, GB_CUDA_STRLEN, + "#define %s(cij) true", macro_condition_name) ; + snprintf (terminal_statement, GB_CUDA_STRLEN, + "#define %s break", macro_statement_name) ; + } + else + { + // typical terminal monoids: check if C(i,j) has reached its + // terminal value + snprintf (terminal_condition, GB_CUDA_STRLEN, + "#define %s(cij) ((cij) == %s)", macro_condition_name, f) ; + snprintf (terminal_statement, GB_CUDA_STRLEN, + "#define %s if (%s (cij)) break", + macro_statement_name, macro_condition_name) ; + } + } + else + { + // the monoid is not terminal: the condition is always false + snprintf (terminal_condition, GB_CUDA_STRLEN, "#define %s(cij) false", + macro_condition_name) ; + snprintf (terminal_statement, GB_CUDA_STRLEN, "#define %s", + macro_statement_name) ; + } + + (*is_monoid_terminal) = is_terminal ; +} + diff --git a/GraphBLAS/CUDA/GB_cuda_type_bits.c b/GraphBLAS/CUDA/GB_cuda_type_bits.c new file mode 100644 index 0000000000..8712f1698d --- /dev/null +++ b/GraphBLAS/CUDA/GB_cuda_type_bits.c @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: Apache-2.0 + +#include "GB.h" + +size_t GB_cuda_type_bits (GB_Type_code); + +size_t GB_cuda_type_bits (GB_Type_code type_code) +{ + switch (type_code) + { + case GB_BOOL_code : return (8) ; + case GB_INT8_code : return (8) ; + case GB_INT16_code : return (16) ; + case GB_INT32_code : return (32) ; + case GB_INT64_code : return (64) ; + case GB_UINT8_code : return (8) ; + case GB_UINT16_code : return (16) ; + case GB_UINT32_code : return (32) ; + case GB_UINT64_code : return (64) ; + case GB_FP32_code : return (32) ; + case GB_FP64_code : return (64) ; +// case GB_FC32_code : return (64) ; +// case GB_FC64_code : return (128) ; + default : return (0) ; + } +} + diff --git a/GraphBLAS/CUDA/GB_cuda_warmup.cu b/GraphBLAS/CUDA/GB_cuda_warmup.cu new file mode 100644 index 0000000000..6a0283bec3 --- /dev/null +++ b/GraphBLAS/CUDA/GB_cuda_warmup.cu @@ -0,0 +1,74 @@ +//------------------------------------------------------------------------------ +// GB_cuda_warmup.cu: warmup the GPU +//------------------------------------------------------------------------------ + +// SPDX-License-Identifier: Apache-2.0 +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2019, All Rights Reserved. +// http://suitesparse.com See GraphBLAS/Doc/License.txt for license. 
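// Illustration (hypothetical macro names, not part of the files in this
// patch): if GB_cuda_stringify_terminal above is given the macro names
// GB_TERMINAL_CONDITION and GB_IF_TERMINAL_BREAK, then for the MIN monoid
// on int64 values it produces
//
//      #define GB_TERMINAL_CONDITION(cij) ((cij) == INT64_MIN)
//      #define GB_IF_TERMINAL_BREAK if (GB_TERMINAL_CONDITION (cij)) break
//
// while for a non-terminal monoid such as PLUS on double it produces
//
//      #define GB_TERMINAL_CONDITION(cij) false
//      #define GB_IF_TERMINAL_BREAK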
+ +//------------------------------------------------------------------------------ + +#include "GB_cuda.h" +/* +#include "rmm/include/rmm/mr/device/managed_memory_resource.hpp" +#include "rmm/include/rmm/mr/device/pool_memory_resource.hpp" +#include "rmm/include/rmm/mr/device/owning_wrapper.hpp" +#include "rmm/include/rmm/mr/device/default_memory_resource.hpp" +#include "rmm/include/rmm/mr/device/per_device_resource.hpp" +#include "rmm/include/rmm/mr/device/cnmem_managed_memory_resource.hpp" +*/ +#include "rmm/detail/cnmem.h" + +bool GB_cuda_warmup (int device) +{ + // allocate 'nothing' just to load the drivers. + // No need to free the result. + double gpu_memory_size = GB_Global_gpu_memorysize_get (device); + + printf ("warming up device %d memsize %g sms %d\n", + device, + gpu_memory_size, + GB_Global_gpu_sm_get (device)) ; + + + //auto cuda_managed = std::make_shared(); + //auto cuda = std::make_shared(); + //auto pool = rmm::mr::make_owning_wrapper + // ( cuda_managed, gpu_memory_size/2, gpu_memory_size ) ; + + std::vector dev{0}; + cnmemDevice_t cnmem_device; + memset(&cnmem_device, 0, sizeof(cnmem_device) ) ; + cnmem_device.size = gpu_memory_size/2; + if( device ==0) + { + cnmemInit(1, &cnmem_device, CNMEM_FLAGS_MANAGED); + } + + //auto pool = std::make_shared ( gpu_memory_size/2 ) ; + + + //rmm::mr::set_per_device_resource ( rmm::cuda_device_id{device}, + // ( rmm::mr::device_memory_resource *)pool.get() ) ; + + //rmm::mr::set_default_resource ( pool.get() ); + //rmm::mr::set_current_device_resource ( pool.get() ); + + //GB_Global_gpu_device_memory_resource_set( device, (void *)rmm::mr::get_current_device_resource() ); + + void *p ; + //cudaError_t err = cudaMalloc (&p, (size_t) 0) ; + //p = rmm::mr::get_current_device_resource()->allocate( 256) ; + //p = pool->allocate( 10) ; + cnmemMalloc( &p, 256 , NULL); + //rmm::mr::get_current_device_resource()->deallocate(p, 1); + //pool->deallocate( p, 10); + cnmemFree( p, NULL); + + printf ("GPU %d nice and toasty now, pool=%g\n", device, gpu_memory_size/2 ) ; + + // TODO check for jit cache? or in GB_init? + + return true; //(err == cudaSuccess) ; +} + diff --git a/GraphBLAS/CUDA/GB_jit_cache.cu b/GraphBLAS/CUDA/GB_jit_cache.cu new file mode 100644 index 0000000000..9df0889865 --- /dev/null +++ b/GraphBLAS/CUDA/GB_jit_cache.cu @@ -0,0 +1,239 @@ +/* + * Copyright (c) 2019,2020 NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "GB_jit_cache.h" + +namespace jit { + + +// Get the directory in home to use for storing the cache +std::string get_user_home_cache_dir() { + auto home_dir = std::getenv("HOME"); + if (home_dir != nullptr) { + return std::string(home_dir) + "/.GraphBLAS/"; + } else { + return std::string(); + } +} + +// Default `GRAPHBLAS_CACHE_PATH` to `$HOME/.GraphBLAS`. +// This definition can be overridden at compile time by specifying a +// `-DGRAPHBLAS_CACHE_PATH=/kernel/cache/path` CMake argument. 
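// For example (echoing the path used above; whether a plain cmake -D flag is
// forwarded as a compile definition depends on the build setup), the two
// override mechanisms are:
//
//      -DGRAPHBLAS_CACHE_PATH=/kernel/cache/path       (compile time)
//      export GRAPHBLAS_CACHE_PATH=/kernel/cache/path  (run time)
//
// The run-time environment variable, if set, takes precedence over the
// compile-time default (see getCacheDir below).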
+// This path is used in the `getCacheDir()` function below. +#if !defined(GRAPHBLAS_CACHE_PATH) +#define GRAPHBLAS_CACHE_PATH get_user_home_cache_dir() +#endif + +/** + * @brief Get the string path to the JITIFY kernel cache directory. + * + * This path can be overridden at runtime by defining an environment variable + * named `GRAPHBLAS_CACHE_PATH`. The value of this variable must be a path + * under which the process' user has read/write priveleges. + * + * This function returns a path to the cache directory, creating it if it + * doesn't exist. + * + * The default cache directory is `$HOME/.GraphBLAS`. If no overrides + * are used and if $HOME is not defined, returns an empty path and file + * caching is not used. + **/ +std::string getCacheDir() { + // The environment variable always overrides the + // default/compile-time value of `GRAPHBLAS_CACHE_PATH` + auto kernel_cache_path_env = std::getenv("GRAPHBLAS_CACHE_PATH"); + auto kernel_cache_path = (kernel_cache_path_env != nullptr ? kernel_cache_path_env + : GRAPHBLAS_CACHE_PATH); + + struct stat st; + if ( (stat( kernel_cache_path.c_str(), &st) != 0) ) { + // `mkdir -p` the kernel cache path if it doesn't exist + printf("cache is going to path %s\n", kernel_cache_path.c_str()); + int status; + status = mkdir(kernel_cache_path.c_str(), 0777); + if (status != 0 ) return std::string(); + //boost::filesystem::create_directories(kernel_cache_path); + } + return std::string(kernel_cache_path); +} + +GBJitCache::GBJitCache() { } + +GBJitCache::~GBJitCache() { } + +std::mutex GBJitCache::_kernel_cache_mutex; +std::mutex GBJitCache::_program_cache_mutex; + +named_prog GBJitCache::getProgram( + std::string const& prog_name, + std::string const& cuda_source, + std::vector const& given_headers, + std::vector const& given_options, + jitify::experimental::file_callback_type file_callback) +{ + // Lock for thread safety + std::lock_guard lock(_program_cache_mutex); + //printf(" jit_cache get program %s\n", prog_name.c_str()); + + return getCached(prog_name, program_map, + [&](){ + return jitify::experimental::Program(cuda_source, + given_headers, + given_options, + file_callback); + } + ); +} + +named_prog GBJitCache::getKernelInstantiation( + std::string const& kern_name, + named_prog const& named_program, + std::vector const& arguments) +{ + // Lock for thread safety + std::lock_guard lock(_kernel_cache_mutex); + + std::string prog_name = std::get<0>(named_program); + jitify::experimental::Program& program = *std::get<1>(named_program); + + // Make instance name e.g. "prog_binop.kernel_v_v_int_int_long int_Add" + std::string kern_inst_name = prog_name + '.' 
+ kern_name; + for ( auto&& arg : arguments ) kern_inst_name += '_' + arg; + + //printf(" got kernel instance %s\n",kern_inst_name.c_str()); + + return getCached(kern_inst_name, kernel_inst_map, + [&](){return program.kernel(kern_name) + .instantiate(arguments); + } + ); +} + +// Another overload for getKernelInstantiation which might be useful to get +// kernel instantiations in one step +// ------------------------------------------------------------------------ +/* +jitify::experimental::KernelInstantiation GBJitCache::getKernelInstantiation( + std::string const& kern_name, + std::string const& prog_name, + std::string const& cuda_source = "", + std::vector const& given_headers = {}, + std::vector const& given_options = {}, + file_callback_type file_callback = nullptr) +{ + auto program = getProgram(prog_name, + cuda_source, + given_headers, + given_options, + file_callback); + return getKernelInstantiation(kern_name, program); +} +*/ + +GBJitCache::cacheFile::cacheFile(std::string file_name) + : _file_name{file_name} +{ } + +GBJitCache::cacheFile::~cacheFile() { } + +std::string GBJitCache::cacheFile::read() +{ + // Open file (duh) + int fd = open ( _file_name.c_str(), O_RDWR ); + if ( fd == -1 ) { + // TODO: connect errors to GrB_error result + //printf(" failed to open cache file %s\n",_file_name.c_str()); + successful_read = false; + return std::string(); + } + + // Lock the file descriptor. we the only ones now + if ( lockf(fd, F_LOCK, 0) == -1 ) { + successful_read = false; + return std::string(); + } + + // Get file descriptor from file pointer + FILE *fp = fdopen( fd, "rb" ); + + // Get file length + fseek( fp , 0L , SEEK_END); + size_t file_size = ftell( fp ); + rewind( fp ); + + // Allocate memory of file length size + std::string content; + content.resize(file_size); + char *buffer = &content[0]; + + // Copy file into buffer + if( fread(buffer, file_size, 1, fp) != 1 ) { + //printf(" failed to read cache file %s\n",_file_name.c_str()); + successful_read = false; + fclose(fp); + free(buffer); + return std::string(); + } + fclose(fp); + successful_read = true; + printf(" read cache file %s\n",_file_name.c_str()); + + return content; +} + +void GBJitCache::cacheFile::write(std::string content) +{ + // Open file and create if it doesn't exist, with access 0600 + int fd = open ( _file_name.c_str(), O_RDWR | O_CREAT, S_IRUSR | S_IWUSR ); + if ( fd == -1 ) { + printf(" failed to open cache file for write %s\n",_file_name.c_str()); + successful_write = false; + return; + } + + // Lock the file descriptor. we the only ones now + if ( lockf(fd, F_LOCK, 0) == -1 ) { + successful_write = false; + return; + } + + // Get file descriptor from file pointer + FILE *fp = fdopen( fd, "wb" ); + + // Copy string into file + if( fwrite(content.c_str(), content.length(), 1, fp) != 1 ) { + printf(" failed to write cache file %s\n",_file_name.c_str()); + successful_write = false; + fclose(fp); + return; + } + fclose(fp); + + successful_write = true; + //printf(" wrote cache file %s\n",_file_name.c_str()); + + return; +} + +} // namespace jit diff --git a/GraphBLAS/CUDA/GB_jit_cache.h b/GraphBLAS/CUDA/GB_jit_cache.h new file mode 100644 index 0000000000..0564c58f73 --- /dev/null +++ b/GraphBLAS/CUDA/GB_jit_cache.h @@ -0,0 +1,222 @@ +/* + * Copyright (c) 2019,2020 NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef GB_JIT_CACHE_H_ +#define GB_JIT_CACHE_H_ + +#include +#include +#include +#include +#include +#include +#include + + +#define JITIFY_USE_CACHE 1 + +namespace jit { + +template +using named_prog = std::pair>; + +/** + * @brief Get the string path to the JITIFY kernel cache directory. + * + * This path can be overridden at runtime by defining an environment variable + * named `GB_CUDA_KERNEL_CACHE_PATH`. The value of this variable must be a path + * under which the process' user has read/write priveleges. + * + * This function returns a path to the cache directory, creating it if it + * doesn't exist. + * + * The default cache directory `~/.GraphBLAS_kernel_cache`. + **/ + +std::string getCacheDir(); + +class GBJitCache +{ +public: + + /**---------------------------------------------------------------------------* + * @brief Get a process wide singleton cache object + * + *---------------------------------------------------------------------------**/ + static GBJitCache& Instance() { + // Meyers' singleton is thread safe in C++11 + // Link: https://stackoverflow.com/a/1661564 + static GBJitCache cache; + return cache; + } + + GBJitCache(); + ~GBJitCache(); + + /**---------------------------------------------------------------------------* + * @brief Get the Kernel Instantiation object + * + * Searches an internal in-memory cache and file based cache for the kernel + * and if not found, JIT compiles and returns the kernel + * + * @param kern_name [in] name of kernel to return + * @param program [in] Jitify preprocessed program to get the kernel from + * @param arguments [in] template arguments for kernel in vector of strings + * @return Pair of string kernel identifier and compiled kernel object + *---------------------------------------------------------------------------**/ + named_prog getKernelInstantiation( + std::string const& kern_name, + named_prog const& program, + std::vector const& arguments); + + /**---------------------------------------------------------------------------* + * @brief Get the Jitify preprocessed Program object + * + * Searches an internal in-memory cache and file based cache for the Jitify + * pre-processed program and if not found, JIT processes and returns it + * + * @param prog_file_name [in] name of program to return + * @param cuda_source [in] string source code of program to compile + * @param given_headers [in] vector of strings representing source or names of + * each header included in cuda_source + * @param given_options [in] vector of strings options to pass to NVRTC + * @param file_callback [in] pointer to callback function to call whenever a + * header needs to be loaded + * @return named_prog + *---------------------------------------------------------------------------**/ + named_prog getProgram( + std::string const& prog_file_name, + std::string const& cuda_source = "", + std::vector const& given_headers = {}, + std::vector const& given_options = {}, + jitify::experimental::file_callback_type file_callback = nullptr); + +private: + template + using umap_str_shptr = std::unordered_map>; + + umap_str_shptr 
kernel_inst_map; + umap_str_shptr program_map; + + /* + Even though this class can be used as a non-singleton, the file cache + access should remain limited to one thread per process. The lockf locks can + prevent multiple processes from accessing the file but are ineffective in + preventing multiple threads from doing so as the lock is shared by the + entire process. + Therefore the mutexes are static. + */ + static std::mutex _kernel_cache_mutex; + static std::mutex _program_cache_mutex; + +private: + /**---------------------------------------------------------------------------* + * @brief Class to allow process wise exclusive access to cache files + * + *---------------------------------------------------------------------------**/ + class cacheFile + { + private: + std::string _file_name ; + std::string _dir_name = "~/.GraphBLAS_kernel_cache/"; + bool successful_read = false; + bool successful_write = false; + public: + cacheFile(std::string file_name); + ~cacheFile(); + + /**---------------------------------------------------------------------------* + * @brief Read this file and return the contents as a std::string + * + *---------------------------------------------------------------------------**/ + std::string read(); + + /**---------------------------------------------------------------------------* + * @brief Write the passed string to this file + * + *---------------------------------------------------------------------------**/ + void write(std::string); + + /**---------------------------------------------------------------------------* + * @brief Check whether the read() operation on the file completed successfully + * + * @return true Read was successful. String returned by `read()` is valid + * @return false Read was unsuccessful. String returned by `read()` is empty + *---------------------------------------------------------------------------**/ + bool is_read_successful() { return successful_read; } + + /**---------------------------------------------------------------------------* + * @brief Check whether the write() operation on the file completed successfully + * + * @return true Write was successful. + * @return false Write was unsuccessful. File state is undefined + *---------------------------------------------------------------------------**/ + bool is_write_successful() { return successful_write; } + }; + +private: + template + named_prog getCached( + std::string const& name, + umap_str_shptr& map, + FallbackFunc func) { + + // Find memory cached T object + auto it = map.find(name); + if ( it != map.end()) { + std::cout<<"found memory-cached prog "<second); + } + else { // Find file cached T object + bool successful_read = false; + std::string serialized; + #if defined(JITIFY_USE_CACHE) + std::string cache_dir = getCacheDir(); + if (not cache_dir.empty() ) { + std::string file_name = cache_dir + name; + //std::cout<<"looking for prog in file "<(T::deserialize(serialized)); + map[name] = program; + //std::cout<<"storing prog in memory "< + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +namespace jit { + + launcher::launcher( + const std::string& hash, + const std::string& cuda_source, + const std::vector& header_names, + const std::vector& compiler_flags, + jitify::experimental::file_callback_type file_callback, + cudaStream_t stream + ) + : cache_instance{jit::GBJitCache::Instance()} + , stream(stream) + { + program = cache_instance.getProgram( + hash, + cuda_source.c_str(), + header_names, + compiler_flags, + file_callback + ); + } + + launcher::launcher(launcher&& launcher) + : program {std::move(launcher.program)} + , cache_instance {jit::GBJitCache::Instance()} + , kernel_inst {std::move(launcher.kernel_inst)} + , stream {launcher.stream} + { } + +} // namespace jit diff --git a/GraphBLAS/CUDA/GB_jit_launcher.h b/GraphBLAS/CUDA/GB_jit_launcher.h new file mode 100644 index 0000000000..c01b385fcc --- /dev/null +++ b/GraphBLAS/CUDA/GB_jit_launcher.h @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2019,2020 NVIDIA CORPORATION. + * + * Copyright 2018-2019 BlazingDB, Inc. + * Copyright 2018 Christian Noboa Mardini + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef GB_JIT_LAUNCHER_H +#define GB_JIT_LAUNCHER_H + +#include +#include +#include +#include +#include + +#define JITIFY_PRINT_INSTANTIATION 0 +#define JITIFY_PRINT_SOURCE 1 +#define JITIFY_PRINT_LOG 1 +#define JITIFY_PRINT_PTX 1 +#define JITIFY_PRINT_LINKER_LOG 0 +#define JITIFY_PRINT_LAUNCH 1 +#include + +const std::vector compiler_flags{ + "-std=c++14", + "--use_fast_math", + "-remove-unused-globals", + "-w", + "-D__CUDACC_RTC__", + "-I.", + "-I..", + "-I../../Include", + "-I../../Source", + "-I../../Source/Template", + "-Ilocal_cub/block", + "-Itemplates", + "-I/usr/local/cuda/include" +}; + +namespace jit { + +/** + * @brief Class used to handle compilation and execution of JIT kernels + * + */ +class launcher { + public: + launcher() = delete; + + /** + * @brief C'tor of the launcher class + * + * Method to generate vector containing all template types for a JIT kernel. + * This vector is used to get the compiled kernel for one set of types and set + * it as the kernel to launch using this launcher. + * + * @param hash The hash to be used as the key for caching + * @param cuda_code The CUDA code that contains the kernel to be launched + * @param header_names Strings of header_names or strings that contain content + * of the header files + * @param compiler_flags Strings of compiler flags + * @param file_callback a function that returns header file contents given header + * file names. 
+ * @param stream The non-owned stream to use for execution + */ + launcher( + const std::string& hash, + const std::string& cuda_source, + const std::vector& header_names, + const std::vector& compiler_flags, + jitify::experimental::file_callback_type file_callback, + cudaStream_t stream = 0 + ); + launcher(launcher&&); + launcher(const launcher&) = delete; + launcher& operator=(launcher&&) = delete; + launcher& operator=(const launcher&) = delete; + + /** + * @brief Sets the kernel to launch using this launcher + * + * Method to generate vector containing all template types for a JIT kernel. + * This vector is used to get the compiled kernel for one set of types and set + * it as the kernel to launch using this launcher. + * + * @param kernel_name The kernel to be launched + * @param arguments The template arguments to be used to instantiate the kernel + * @return launcher& ref to this launcehr object + */ + launcher& set_kernel_inst( + const std::string& kernel_name, + const std::vector& arguments + ) + { // program is a member variable of the launcher + kernel_inst = cache_instance.getKernelInstantiation(kernel_name, program, arguments); + return *this; + } + + /** + * @brief Handle the Jitify API to launch using information + * contained in the members of `this` + * + * @tparam grid and block sizes + * @return Return launcher reference if successful + */ + jitify::experimental::KernelLauncher configure( dim3 grid, dim3 block){ + return get_kernel().configure( grid, block); + //return get_kernel().configure_1d_max_occupancy( max_block_size=block.x); + } + + + /** + * @brief Handle the Jitify API to launch using information + * contained in the members of `this` + * + * @tparam All parameters to launch the kernel + * @return Return GDF_SUCCESS if successful + */ + template + void launch(Args ... args){ + get_kernel().configure_1d_max_occupancy(32, 0, 0, stream).launch(args...); + } + + private: + jit::GBJitCache& cache_instance; + jit::named_prog program; + jit::named_prog kernel_inst; + cudaStream_t stream; + + jitify::experimental::KernelInstantiation& get_kernel() { return *std::get<1>(kernel_inst); } +}; + +} // namespace jit + +#endif diff --git a/GraphBLAS/CUDA/GB_reduce_to_scalar_cuda.cu b/GraphBLAS/CUDA/GB_reduce_to_scalar_cuda.cu new file mode 100644 index 0000000000..4c52e4d427 --- /dev/null +++ b/GraphBLAS/CUDA/GB_reduce_to_scalar_cuda.cu @@ -0,0 +1,93 @@ + +//------------------------------------------------------------------------------ +// GB_reduce_to_scalar_cuda.cu: reduce on the GPU with semiring +//------------------------------------------------------------------------------ + +// SPDX-License-Identifier: Apache-2.0 +// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2019, All Rights Reserved. +// http://suitesparse.com See GraphBLAS/Doc/License.txt for license. 
+
+//------------------------------------------------------------------------------
+
+#include "GB_cuda.h"
+
+#include "templates/reduceWarp.cu.jit"
+#include "templates/reduceNonZombiesWarp.cu.jit"
+#include "test/semiringFactory.hpp"
+
+#include "GB_jit_launcher.h"
+#include "GB_callback.hpp"
+
+GB_callback *SR_callback_ptr;
+
+std::istream* callback_wrapper( std::string file_name, std::iostream& tmp){
+  return SR_callback_ptr->callback( file_name, tmp);
+}
+
+const std::vector<std::string> header_names ={};
+
+GrB_Info GB_reduce_to_scalar_cuda
+(
+    GB_void *s,
+    const GrB_Monoid reduce,
+    const GrB_Matrix A,
+    GB_Context Context
+)
+{
+
+    printf ("Hi I am %s :-)\n", __FILE__) ;
+
+    // result = sum (Anz [0..anz-1]) using the GPU,
+    // with a kernel that has ntasks = grid.x and blocksize = blockDim.x
+    // nthreads = # of GPUs to use, but 1 for now
+    // We have a workspace W of size ntasks.
+
+    thread_local static jitify::JitCache kernel_cache;
+    std::string reduce_kernel_name = "reduceNonZombiesWarp";
+
+    // stringified kernel specified above
+    jitify::Program program= kernel_cache.program( templates_reduceNonZombiesWarp_cu, 0, 0,
+        file_callback_plus);
+        //{"--use_fast_math", "-I/usr/local/cuda/include"});
+
+    int nnz = GB_NNZ( A ) ;
+    GrB_Type ctype = reduce->op->ztype ;
+
+    int blocksize = 1024 ;
+    int ntasks = ( nnz + blocksize -1) / blocksize ;
+
+    int32_t *block_sum;
+    //cudaMallocManaged ((void**) &block_sum, (num_reduce_blocks)*sizeof(int32_t)) ;
+    block_sum = (int32_t*)GB_cuda_malloc( (ntasks)*sizeof(int32_t)) ;
+
+    dim3 red_grid(ntasks);
+    dim3 red_block(blocksize);
+
+    GBURBLE ("(GPU reduce launch nblocks,blocksize= %d,%d )\n", ntasks, blocksize) ;
+    jit::launcher( reduce_kernel_name + "_" + reduce->op->name,
+                   templates_reduceNonZombiesWarp_cu,
+                   header_names,
+                   compiler_flags,
+                   callback_wrapper)
+       .set_kernel_inst( reduce_kernel_name , { ctype->name })
+       .configure(red_grid, red_block) //if commented, use implicit 1D configure in launch
+       .launch(
+                A->i,               // index vector, only sum up values >= 0
+                A->x,               // input pointer to vector to reduce, with zombies
+                block_sum,          // Block sums on return
+                (unsigned int)nnz   // length of vector to reduce to scalar
+
+        );
+
+    cudaDeviceSynchronize();
+
+
+    for (int i = 0 ; i < ntasks ; i++)
+    {
+       *s += (block_sum [i]) ;
+    }
+
+
+    return (GrB_SUCCESS) ;
+}
+
diff --git a/GraphBLAS/CUDA/License.txt b/GraphBLAS/CUDA/License.txt
new file mode 100644
index 0000000000..8ad4645770
--- /dev/null
+++ b/GraphBLAS/CUDA/License.txt
@@ -0,0 +1,36 @@
+This directory contains licensed OSS under the following terms:
+
+RMM
+http://github.com/rapidsai/rmm
+Apache-2.0 license
+
+CNMEM
+http://github.com/NVIDIA/cnmem
+BSD 3-Clause
+
+Jitify
+http://github.com/NVIDIA/jitify
+BSD 3-Clause "New" or "Revised" License
+
+CUB
+http://github.com/NVIDIA/cub
+BSD 3-Clause "New" or "Revised" License
+
+In addition, any source files not part of the above packages are hereby
+licensed under the Apache-2.0 license.
+
+Copyright 2020, NVIDIA Corporation
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
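The templates/reduceNonZombiesWarp.cu kernel source is embedded above only in its
stringified .cu.jit form, so the kernel body itself does not appear in this patch.
As a rough, hypothetical sketch of what such a kernel could look like — assuming
the template parameter T plays the role of ctype above, that an entry p is a
zombie iff A->i [p] < 0, that blockDim.x is a power of two no larger than 1024
(the host code launches with blocksize = 1024), and using + in place of the
monoid operator:

// Hypothetical sketch only; not the actual templates/reduceNonZombiesWarp.cu.
template <typename T>
__global__ void reduceNonZombiesWarp_sketch     // one partial sum per block
(
    const int64_t *Ai,      // index vector; a negative index marks a zombie
    const T *Ax,            // values to reduce
    T *block_sum,           // output: block_sum [0..gridDim.x-1]
    unsigned int n          // number of entries in Ai and Ax
)
{
    // each thread sums its grid-stride slice, skipping zombies
    T sum = 0 ;
    for (unsigned int p = blockIdx.x * blockDim.x + threadIdx.x ; p < n ;
         p += blockDim.x * gridDim.x)
    {
        if (Ai [p] >= 0) sum += Ax [p] ;
    }
    // combine the per-thread sums within this block in shared memory
    __shared__ T partial [1024] ;
    partial [threadIdx.x] = sum ;
    __syncthreads ( ) ;
    for (unsigned int stride = blockDim.x >> 1 ; stride > 0 ; stride >>= 1)
    {
        if (threadIdx.x < stride)
        {
            partial [threadIdx.x] += partial [threadIdx.x + stride] ;
        }
        __syncthreads ( ) ;
    }
    if (threadIdx.x == 0) block_sum [blockIdx.x] = partial [0] ;
}

The host code in GB_reduce_to_scalar_cuda above then finishes the reduction by
summing block_sum [0..ntasks-1] after cudaDeviceSynchronize.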
+ + diff --git a/GraphBLAS/CUDA/Makefile b/GraphBLAS/CUDA/Makefile new file mode 100644 index 0000000000..de2d973810 --- /dev/null +++ b/GraphBLAS/CUDA/Makefile @@ -0,0 +1,135 @@ +#------------------------------------------------------------------------------- +# GraphBLAS/CUDA/Makefile +#------------------------------------------------------------------------------- + +# cuda 10.1+ is assumed + +all: library + +GXX ?= g++ +DOXYGEN ?= doxygen +CXXFLAGS ?= -O3 -Wall -g -fmessage-length=80 + +CXX11 ?= 1 + +CUDA_DIR ?= /usr/local/cuda + +CXXFLAGS += -pthread + +ifeq ($(CXX11),1) + CXXFLAGS += -std=c++14 +endif + +EMBED_BEGIN = -rdynamic -Wl,-b,binary, +EMBED_END = ,-b,default + +UNAME_S := $(shell uname -s) +ifeq ($(UNAME_S),Linux) + CXXFLAGS += -D LINUX + CUDA_LIB_DIR = $(CUDA_DIR)/lib64 +else ifeq ($(UNAME_S),Darwin) + CUDA_LIB_DIR = $(CUDA_DIR)/lib +endif + +INC += -I$(CUDA_DIR)/include +LIB += -ldl -L$(CUDA_LIB_DIR) -lcuda -lcudart -lnvrtc + + +GCC ?= gcc + +SRC = GB*.cu +SRC2 = $(notdir $(wildcard $(SRC))) +OBJ = $(SRC2:.cu=.o) +cSRC = $(wildcard *.c) +cOBJ = $(cSRC:.c=.o) + +I = -I. -I../Source -I../Source/Template -I../Include -Irmm/rmm/include/ -Irmm/thirdparty/spdlog/include -Irmm/include/thirdparty/cnmem/include +SO_NAME = libgraphblascuda.so +SO_OPTS = --shared \ + -Xlinker -soname \ + -Xlinker $(SO_NAME) + +LIBS = -L/usr/local/cuda/lib64 -lcudadevrt -lcudart -lrmm -lspdlog -lcnmem + +CUDA_OPTS = -O2 --cudart=shared --gpu-architecture=compute_75 \ + --relocatable-device-code true \ + --std=c++14 -Xcompiler -fPIC + +library: $(SO_NAME) + +HEADERS = jitify.hpp + +TEMPLATES := $(wildcard templates/*.cu) + +JIT_TEMP := $(patsubst %.cu, %.cu.jit, $(TEMPLATES)) + +%.cu: %.cutmp + cp $? $@ + +%.cu.jit: %.cu + ./stringify $? > $@ + +stringify: stringify.cpp + $(GXX) -o $@ $< -O3 -Wall + +doc: jitify.hpp Doxyfile + $(DOXYGEN) Doxyfile +.PHONY: doc + +test: $(cOBJ) + @echo $(cOBJ) + +$(cOBJ): %.o: %.c GB_cuda_stringify.h + $(GCC) $(I) -o $@ -c $< -O2 -Wall + +$(SO_NAME): $(OBJ) $(cOBJ) $(JIT_TEMP) GB_AxB_dot3_cuda.o + echo $(OBJ) + nvcc $(SO_OPTS) $(LIBS) $(OBJ) $(cOBJ) -o $@ + +GB_AxB_dot3_cuda.o: $(JIT_TEMP) matrix.h +%.o: %.cu + nvcc -c $(I) $(CUDA_OPTS) -o $@ $< $(LIBS) + + +config: + nvidia-smi + nvcc --version + @echo " " + @echo "SO_NAME: " $(SO_NAME) + @echo "SO_OPTS: " $(SO_OPTS) + @echo "LIBS: " $(LIBS) + @echo "CUDA_OPTS: " $(CUDA_OPTS) + @echo "SRC: " $(SRC) + @echo "OBJ: " $(OBJ) + @echo "I: " $(I) + @echo " " + gcc --version + icc --version + +clean: + rm -f *.o + rm -f stringify +.PHONY: clean + +distclean: clean + rm -f *.so *.a + +purge: distclean + +################################################################################ + + +EMBED_BEGIN = -rdynamic -Wl,-b,binary, +EMBED_END = ,-b,default + +UNAME_S := $(shell uname -s) +ifeq ($(UNAME_S),Linux) + CXXFLAGS += -D LINUX + CUDA_LIB_DIR = $(CUDA_DIR)/lib64 +else ifeq ($(UNAME_S),Darwin) + CUDA_LIB_DIR = $(CUDA_DIR)/lib +endif + + + + diff --git a/GraphBLAS/CUDA/Makefile.jitFactory b/GraphBLAS/CUDA/Makefile.jitFactory new file mode 100644 index 0000000000..eb6b0c7995 --- /dev/null +++ b/GraphBLAS/CUDA/Makefile.jitFactory @@ -0,0 +1,59 @@ + +GXX ?= g++ +DOXYGEN ?= doxygen +CXXFLAGS ?= -O3 -Wall -g -fmessage-length=80 + +CXX11 ?= 0 +CXX14 ?= 1 + +CUDA_DIR ?= /usr/local/cuda + +CXXFLAGS += -pthread + +ifeq ($(CXX11),1) + CXXFLAGS += -std=c++11 +endif +ifeq ($(CXX14),1) + CXXFLAGS += -std=c++14 +endif + +EMBED_BEGIN = -rdynamic -Wl,-b,binary, +EMBED_END = ,-b,default + +UNAME_S := $(shell uname -s) +ifeq ($(UNAME_S),Linux) + 
CXXFLAGS += -D LINUX + CUDA_LIB_DIR = $(CUDA_DIR)/lib64 +else ifeq ($(UNAME_S),Darwin) + CUDA_LIB_DIR = $(CUDA_DIR)/lib +endif + +INC += -I$(CUDA_DIR)/include +LIB += -ldl -L$(CUDA_LIB_DIR) -lcuda -lcudart -lnvrtc + +HEADERS = jitify.hpp + +TEMPLATES := $(wildcard *.cutmp) + +JIT_TEMP := $(patsubst %.cutmp, %.cu.jit, $(TEMPLATES)) + +jitFactory: jitFactory.cpp $(HEADERS) $(JIT_TEMP) + $(GXX) -o $@ $< $(CXXFLAGS) $(INC) $(LIB) + +%.cu: %.cutmp + cp $? $@ + +%.cu.jit: %.cu + ./stringify $? > $@ + +stringify: stringify.cpp + $(GXX) -o $@ $< -O3 -Wall + +doc: jitify.hpp Doxyfile + $(DOXYGEN) Doxyfile +.PHONY: doc + +clean: + rm -f stringify + rm -f jitFactory +.PHONY: clean diff --git a/GraphBLAS/CUDA/Makefile_new b/GraphBLAS/CUDA/Makefile_new new file mode 100644 index 0000000000..462bda5636 --- /dev/null +++ b/GraphBLAS/CUDA/Makefile_new @@ -0,0 +1,49 @@ +#------------------------------------------------------------------------------- +# GraphBLAS/CUDA/Makefile +#------------------------------------------------------------------------------- + +# cuda 10.1 is assumed + +SRC = GB*.cu +SRC2 = $(notdir $(wildcard $(SRC))) +OBJ = $(SRC2:.cu=.o) + +I = -I. -I../Source -I../Source/Template -I../Include +SO_NAME = libgraphblascuda.a + +LIBS = -L/usr/local/cuda/lib64 -lcudadevrt -lcudart + +CUDA_OPTS = -O2 --cudart=shared \ + --relocatable-device-code true \ + --std=c++11 -Xcompiler -fPIC + +$(SO_NAME): $(OBJ) + echo $(OBJ) + ar rv $@ $^ + +%.o: %.cu + nvcc -c $(I) $(CUDA_OPTS) -o $@ $< $(LIBS) + +config: + nvidia-smi + nvcc --version + @echo " " + @echo "SO_NAME: " $(SO_NAME) + @echo "SO_OPTS: " $(SO_OPTS) + @echo "LIBS: " $(LIBS) + @echo "CUDA_OPTS: " $(CUDA_OPTS) + @echo "SRC: " $(SRC) + @echo "OBJ: " $(OBJ) + @echo "I: " $(I) + @echo " " + gcc --version + icc --version + +clean: + rm -f *.o + +distclean: clean + rm -f *.so + +purge: distclean + diff --git a/GraphBLAS/CUDA/TODO.txt b/GraphBLAS/CUDA/TODO.txt new file mode 100644 index 0000000000..b80a11783c --- /dev/null +++ b/GraphBLAS/CUDA/TODO.txt @@ -0,0 +1,4 @@ + +TODO Get libgraphblascuda.a to work. +TODO why is pthread demo hanging with CUDA? + diff --git a/GraphBLAS/CUDA/binary_search.h b/GraphBLAS/CUDA/binary_search.h new file mode 100644 index 0000000000..a21d07e1e6 --- /dev/null +++ b/GraphBLAS/CUDA/binary_search.h @@ -0,0 +1,39 @@ + +#define GB_GETA( aval, ax, p) aval = (T_Z)ax[ ( p )] +#define GB_GETB( bval, bx, p) bval = (T_Z)bx[ ( p )] +#define GB_FLIP(i) (-(i)-2) +#define GB_IS_FLIPPED(i) ((i) < 0) +#define GB_IS_ZOMBIE(i) ((i) < 0) +#define GB_IS_NOT_FLIPPED(i) ((i) >= 0) +#define GB_IS_NOT_ZOMBIE(i) ((i) >= 0) +#define GB_UNFLIP(i) (((i) < 0) ? GB_FLIP(i) : (i)) + +//------------------------------------------------------------------------------ +// GB_BINARY_SEARCH +//------------------------------------------------------------------------------ + +// search for integer i in the list X [pleft...pright]; no zombies. +// The list X [pleft ... pright] is in ascending order. It may have +// duplicates. + +#define GB_BINARY_TRIM_SEARCH(i,X,pleft,pright) \ +{ \ + /* binary search of X [pleft ... 
pright] for integer i */ \ + while (pleft < pright) \ + { \ + int64_t pmiddle = (pleft + pright) / 2 ; \ + if (X [pmiddle] < i) \ + { \ + /* if in the list, it appears in [pmiddle+1..pright] */ \ + pleft = pmiddle + 1 ; \ + } \ + else \ + { \ + /* if in the list, it appears in [pleft..pmiddle] */ \ + pright = pmiddle ; \ + } \ + } \ + /* binary search is narrowed down to a single item */ \ + /* or it has found the list is empty */ \ + /*ASSERT (pleft == pright || pleft == pright + 1) ;*/ \ +} diff --git a/GraphBLAS/CUDA/dot.c b/GraphBLAS/CUDA/dot.c new file mode 100644 index 0000000000..16e40f06aa --- /dev/null +++ b/GraphBLAS/CUDA/dot.c @@ -0,0 +1,31 @@ + +consider these methods on the GPU: (see ../Source/Template/GB_AxB_dot_cij.c) + + while (pA < pA_end && pB < pB_end) + { + int64_t ia = Ai [pA] ; + int64_t ib = Bi [pB] ; + + #if 0 + if (ia == ib) + { + GB_DOT (ia, pA, pB) ; + pA++ ; + pB++ ; + } + else + { + pA += (ia < ib) ; + pB += (ib < ia) ; + } + #endif + + #if 0 + // this might be fastest on the GPU + #if GB_IS_PLUS_PAIR_REAL_SEMIRING && GB_CTYPE_IGNORE_OVERFLOW + cij += (ia == ib) ; + pA += (ia <= ib) ; + pB += (ib <= ia) ; + #endif + #endif + } diff --git a/GraphBLAS/CUDA/go b/GraphBLAS/CUDA/go new file mode 100755 index 0000000000..4d7e48c801 --- /dev/null +++ b/GraphBLAS/CUDA/go @@ -0,0 +1,3 @@ +#!/bin/bash +./jitFactory > o ; vim o + diff --git a/GraphBLAS/CUDA/jitify.hpp b/GraphBLAS/CUDA/jitify.hpp new file mode 100644 index 0000000000..e9ff891155 --- /dev/null +++ b/GraphBLAS/CUDA/jitify.hpp @@ -0,0 +1,4185 @@ +/* + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + ----------- + Jitify 0.9 + ----------- + A C++ library for easy integration of CUDA runtime compilation into + existing codes. + + -------------- + How to compile + -------------- + Compiler dependencies: , -std=c++11 + Linker dependencies: dl cuda nvrtc + + -------------------------------------- + Embedding source files into executable + -------------------------------------- + g++ ... 
-ldl -rdynamic -DJITIFY_ENABLE_EMBEDDED_FILES=1 + -Wl,-b,binary,my_kernel.cu,include/my_header.cuh,-b,default nvcc ... -ldl + -Xcompiler "-rdynamic + -Wl\,-b\,binary\,my_kernel.cu\,include/my_header.cuh\,-b\,default" + JITIFY_INCLUDE_EMBEDDED_FILE(my_kernel_cu); + JITIFY_INCLUDE_EMBEDDED_FILE(include_my_header_cuh); + + ---- + TODO + ---- + Extract valid compile options and pass the rest to cuModuleLoadDataEx + See if can have stringified headers automatically looked-up + by having stringify add them to a (static) global map. + The global map can be updated by creating a static class instance + whose constructor performs the registration. + Can then remove all headers from JitCache constructor in example code + See other TODOs in code +*/ + +/*! \file jitify.hpp + * \brief The Jitify library header + */ + +/*! \mainpage Jitify - A C++ library that simplifies the use of NVRTC + * \p Use class jitify::JitCache to manage and launch JIT-compiled CUDA + * kernels. + * + * \p Use namespace jitify::reflection to reflect types and values into + * code-strings. + * + * \p Use JITIFY_INCLUDE_EMBEDDED_FILE() to declare files that have been + * embedded into the executable using the GCC linker. + * + * \p Use jitify::parallel_for and JITIFY_LAMBDA() to generate and launch + * simple kernels. + */ + +#pragma once + +#ifndef JITIFY_THREAD_SAFE +#define JITIFY_THREAD_SAFE 1 +#endif + +#if JITIFY_ENABLE_EMBEDDED_FILES +#include +#endif +#include +#include +#include +#include // For strtok_r etc. +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if JITIFY_THREAD_SAFE +#include +#endif + +#include +#include // For dim3, cudaStream_t +#if CUDA_VERSION >= 8000 +#define NVRTC_GET_TYPE_NAME 1 +#endif +#include + +// For use by get_current_executable_path(). +#ifdef __linux__ +#include // For PATH_MAX + +#include // For realpath +#define JITIFY_PATH_MAX PATH_MAX +#elif defined(_WIN32) || defined(_WIN64) +#include +#define JITIFY_PATH_MAX MAX_PATH +#else +#error "Unsupported platform" +#endif + +#ifdef _MSC_VER // MSVC compiler +#include // For UnDecorateSymbolName +#else +#include // For abi::__cxa_demangle +#endif + +#if defined(_WIN32) || defined(_WIN64) +// WAR for strtok_r being called strtok_s on Windows +#pragma push_macro("strtok_r") +#undef strtok_r +#define strtok_r strtok_s +// WAR for min and max possibly being macros defined by windows.h +#pragma push_macro("min") +#pragma push_macro("max") +#undef min +#undef max +#endif + +#ifndef JITIFY_PRINT_LOG +#define JITIFY_PRINT_LOG 1 +#endif + +#if JITIFY_PRINT_ALL +#define JITIFY_PRINT_INSTANTIATION 1 +#define JITIFY_PRINT_SOURCE 1 +#define JITIFY_PRINT_LOG 1 +#define JITIFY_PRINT_PTX 1 +#define JITIFY_PRINT_LINKER_LOG 1 +#define JITIFY_PRINT_LAUNCH 1 +#define JITIFY_PRINT_HEADER_PATHS 1 +#endif + +#if JITIFY_ENABLE_EMBEDDED_FILES +#define JITIFY_FORCE_UNDEFINED_SYMBOL(x) void* x##_forced = (void*)&x +/*! Include a source file that has been embedded into the executable using the + * GCC linker. + * \param name The name of the source file (not as a string), which must + * be sanitized by replacing non-alpha-numeric characters with underscores. + * E.g., \code{.cpp}JITIFY_INCLUDE_EMBEDDED_FILE(my_header_h)\endcode will + * include the embedded file "my_header.h". + * \note Files declared with this macro can be referenced using + * their original (unsanitized) filenames when creating a \p + * jitify::Program instance. 
+ */ +#define JITIFY_INCLUDE_EMBEDDED_FILE(name) \ + extern "C" uint8_t _jitify_binary_##name##_start[] asm("_binary_" #name \ + "_start"); \ + extern "C" uint8_t _jitify_binary_##name##_end[] asm("_binary_" #name \ + "_end"); \ + JITIFY_FORCE_UNDEFINED_SYMBOL(_jitify_binary_##name##_start); \ + JITIFY_FORCE_UNDEFINED_SYMBOL(_jitify_binary_##name##_end) +#endif // JITIFY_ENABLE_EMBEDDED_FILES + +/*! Jitify library namespace + */ +namespace jitify { + +/*! Source-file load callback. + * + * \param filename The name of the requested source file. + * \param tmp_stream A temporary stream that can be used to hold source code. + * \return A pointer to an input stream containing the source code, or NULL + * to defer loading of the file to Jitify's file-loading mechanisms. + */ +typedef std::istream* (*file_callback_type)(std::string filename, + std::iostream& tmp_stream); + +// Exclude from Doxygen +//! \cond + +class JitCache; + +// Simple cache using LRU discard policy +template +class ObjectCache { + public: + typedef KeyType key_type; + typedef ValueType value_type; + + private: + typedef std::map object_map; + typedef std::deque key_rank; + typedef typename key_rank::iterator rank_iterator; + object_map _objects; + key_rank _ranked_keys; + size_t _capacity; + + inline void discard_old(size_t n = 0) { + if (n > _capacity) { + throw std::runtime_error("Insufficient capacity in cache"); + } + while (_objects.size() > _capacity - n) { + key_type discard_key = _ranked_keys.back(); + _ranked_keys.pop_back(); + _objects.erase(discard_key); + } + } + + public: + inline ObjectCache(size_t capacity = 8) : _capacity(capacity) {} + inline void resize(size_t capacity) { + _capacity = capacity; + this->discard_old(); + } + inline bool contains(const key_type& k) const { + return (bool)_objects.count(k); + } + inline void touch(const key_type& k) { + if (!this->contains(k)) { + throw std::runtime_error("Key not found in cache"); + } + rank_iterator rank = std::find(_ranked_keys.begin(), _ranked_keys.end(), k); + if (rank != _ranked_keys.begin()) { + // Move key to front of ranks + _ranked_keys.erase(rank); + _ranked_keys.push_front(k); + } + } + inline value_type& get(const key_type& k) { + if (!this->contains(k)) { + throw std::runtime_error("Key not found in cache"); + } + this->touch(k); + return _objects[k]; + } + inline value_type& insert(const key_type& k, + const value_type& v = value_type()) { + this->discard_old(1); + _ranked_keys.push_front(k); + return _objects.insert(std::make_pair(k, v)).first->second; + } + template + inline value_type& emplace(const key_type& k, Args&&... 
args) { + this->discard_old(1); + // Note: Use of piecewise_construct allows non-movable non-copyable types + auto iter = _objects + .emplace(std::piecewise_construct, std::forward_as_tuple(k), + std::forward_as_tuple(args...)) + .first; + _ranked_keys.push_front(iter->first); + return iter->second; + } +}; + +namespace detail { + +// Convenience wrapper for std::vector that provides handy constructors +template +class vector : public std::vector { + typedef std::vector super_type; + + public: + vector() : super_type() {} + vector(size_t n) : super_type(n) {} // Note: Not explicit, allows =0 + vector(std::vector const& vals) : super_type(vals) {} + template + vector(T const (&vals)[N]) : super_type(vals, vals + N) {} + vector(std::vector&& vals) : super_type(vals) {} + vector(std::initializer_list vals) : super_type(vals) {} +}; + +// Helper functions for parsing/manipulating source code + +inline std::string replace_characters(std::string str, + std::string const& oldchars, + char newchar) { + size_t i = str.find_first_of(oldchars); + while (i != std::string::npos) { + str[i] = newchar; + i = str.find_first_of(oldchars, i + 1); + } + return str; +} +inline std::string sanitize_filename(std::string name) { + return replace_characters(name, "/\\.-: ?%*|\"<>", '_'); +} + +#if JITIFY_ENABLE_EMBEDDED_FILES +class EmbeddedData { + void* _app; + EmbeddedData(EmbeddedData const&); + EmbeddedData& operator=(EmbeddedData const&); + + public: + EmbeddedData() { + _app = dlopen(NULL, RTLD_LAZY); + if (!_app) { + throw std::runtime_error(std::string("dlopen failed: ") + dlerror()); + } + dlerror(); // Clear any existing error + } + ~EmbeddedData() { + if (_app) { + dlclose(_app); + } + } + const uint8_t* operator[](std::string key) const { + key = sanitize_filename(key); + key = "_binary_" + key; + uint8_t const* data = (uint8_t const*)dlsym(_app, key.c_str()); + if (!data) { + throw std::runtime_error(std::string("dlsym failed: ") + dlerror()); + } + return data; + } + const uint8_t* begin(std::string key) const { + return (*this)[key + "_start"]; + } + const uint8_t* end(std::string key) const { return (*this)[key + "_end"]; } +}; +#endif // JITIFY_ENABLE_EMBEDDED_FILES + +inline bool is_tokenchar(char c) { + return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || + (c >= '0' && c <= '9') || c == '_'; +} +inline std::string replace_token(std::string src, std::string token, + std::string replacement) { + size_t i = src.find(token); + while (i != std::string::npos) { + if (i == 0 || i == src.size() - token.size() || + (!is_tokenchar(src[i - 1]) && !is_tokenchar(src[i + token.size()]))) { + src.replace(i, token.size(), replacement); + i += replacement.size(); + } else { + i += token.size(); + } + i = src.find(token, i); + } + return src; +} +inline std::string path_base(std::string p) { + // "/usr/local/myfile.dat" -> "/usr/local" + // "foo/bar" -> "foo" + // "foo/bar/" -> "foo/bar" +#if defined _WIN32 || defined _WIN64 + char sep = '\\'; +#else + char sep = '/'; +#endif + size_t i = p.find_last_of(sep); + if (i != std::string::npos) { + return p.substr(0, i); + } else { + return ""; + } +} +inline std::string path_join(std::string p1, std::string p2) { +#ifdef _WIN32 + char sep = '\\'; +#else + char sep = '/'; +#endif + if (p1.size() && p2.size() && p2[0] == sep) { + throw std::invalid_argument("Cannot join to absolute path"); + } + if (p1.size() && p1[p1.size() - 1] != sep) { + p1 += sep; + } + return p1 + p2; +} +// Elides "/." and "/.." tokens from path. 
+inline std::string path_simplify(const std::string& path) { + std::vector dirs; + std::string cur_dir; + bool after_slash = false; + for (int i = 0; i < (int)path.size(); ++i) { + if (path[i] == '/') { + if (after_slash) continue; // Ignore repeat slashes + after_slash = true; + if (cur_dir == ".." && !dirs.empty() && dirs.back() != "..") { + if (dirs.size() == 1 && dirs.front().empty()) { + throw std::runtime_error( + "Invalid path: back-traversals exceed depth of absolute path"); + } + dirs.pop_back(); + } else if (cur_dir != ".") { // Ignore /./ + dirs.push_back(cur_dir); + } + cur_dir.clear(); + } else { + after_slash = false; + cur_dir.push_back(path[i]); + } + } + if (!after_slash) { + dirs.push_back(cur_dir); + } + std::stringstream ss; + for (int i = 0; i < (int)dirs.size() - 1; ++i) { + ss << dirs[i] << "/"; + } + if (!dirs.empty()) ss << dirs.back(); + if (after_slash) ss << "/"; + return ss.str(); +} +inline unsigned long long hash_larson64(const char* s, + unsigned long long seed = 0) { + unsigned long long hash = seed; + while (*s) { + hash = hash * 101 + *s++; + } + return hash; +} + +inline uint64_t hash_combine(uint64_t a, uint64_t b) { + // Note: The magic number comes from the golden ratio + return a ^ (0x9E3779B97F4A7C17ull + b + (b >> 2) + (a << 6)); +} + +inline bool extract_include_info_from_compile_error(std::string log, + std::string& name, + std::string& parent, + int& line_num) { + static const std::vector pattern = { + "could not open source file \"", "cannot open source file \""}; + + for (auto& p : pattern) { + size_t beg = log.find(p); + if (beg != std::string::npos) { + beg += p.size(); + size_t end = log.find("\"", beg); + name = log.substr(beg, end - beg); + + size_t line_beg = log.rfind("\n", beg); + if (line_beg == std::string::npos) { + line_beg = 0; + } else { + line_beg += 1; + } + + size_t split = log.find("(", line_beg); + parent = log.substr(line_beg, split - line_beg); + line_num = + atoi(log.substr(split + 1, log.find(")", split + 1) - (split + 1)) + .c_str()); + + return true; + } + } + + return false; +} + +inline bool is_include_directive_with_quotes(const std::string& source, + int line_num) { + // TODO: Check each find() for failure. 
+ size_t beg = 0; + for (int i = 1; i < line_num; ++i) { + beg = source.find("\n", beg) + 1; + } + beg = source.find("include", beg) + 7; + beg = source.find_first_of("\"<", beg); + return source[beg] == '"'; +} + +inline std::string comment_out_code_line(int line_num, std::string source) { + size_t beg = 0; + for (int i = 1; i < line_num; ++i) { + beg = source.find("\n", beg) + 1; + } + return (source.substr(0, beg) + "//" + source.substr(beg)); +} + +inline void print_with_line_numbers(std::string const& source) { + int linenum = 1; + std::stringstream source_ss(source); + for (std::string line; std::getline(source_ss, line); ++linenum) { + std::cout << std::setfill(' ') << std::setw(3) << linenum << " " << line + << std::endl; + } +} + +inline void print_compile_log(std::string program_name, + std::string const& log) { + std::cout << "---------------------------------------------------" + << std::endl; + std::cout << "--- JIT compile log for " << program_name << " ---" + << std::endl; + std::cout << "---------------------------------------------------" + << std::endl; + std::cout << log << std::endl; + std::cout << "---------------------------------------------------" + << std::endl; +} + +inline std::vector split_string(std::string str, + long maxsplit = -1, + std::string delims = " \t") { + std::vector results; + if (maxsplit == 0) { + results.push_back(str); + return results; + } + // Note: +1 to include NULL-terminator + std::vector v_str(str.c_str(), str.c_str() + (str.size() + 1)); + char* c_str = v_str.data(); + char* saveptr = c_str; + char* token = nullptr; + for (long i = 0; i != maxsplit; ++i) { + token = ::strtok_r(c_str, delims.c_str(), &saveptr); + c_str = 0; + if (!token) { + return results; + } + results.push_back(token); + } + // Check if there's a final piece + token += ::strlen(token) + 1; + if (token - v_str.data() < (ptrdiff_t)str.size()) { + // Find the start of the final piece + token += ::strspn(token, delims.c_str()); + if (*token) { + results.push_back(token); + } + } + return results; +} + +static const std::map& get_jitsafe_headers_map(); + +inline bool load_source( + std::string filename, std::map& sources, + std::string current_dir = "", + std::vector include_paths = std::vector(), + file_callback_type file_callback = 0, + std::map* fullpaths = nullptr, + bool search_current_dir = true) { + std::istream* source_stream = 0; + std::stringstream string_stream; + std::ifstream file_stream; + // First detect direct source-code string ("my_program\nprogram_code...") + size_t newline_pos = filename.find("\n"); + if (newline_pos != std::string::npos) { + std::string source = filename.substr(newline_pos + 1); + filename = filename.substr(0, newline_pos); + string_stream << source; + source_stream = &string_stream; + } + if (sources.count(filename)) { + // Already got this one + return true; + } + if (!source_stream) { + std::string fullpath = path_join(current_dir, filename); + // Try loading from callback + if (!file_callback || + !(source_stream = file_callback(fullpath, string_stream))) { +#if JITIFY_ENABLE_EMBEDDED_FILES + // Try loading as embedded file + EmbeddedData embedded; + std::string source; + try { + source.assign(embedded.begin(fullpath), embedded.end(fullpath)); + string_stream << source; + source_stream = &string_stream; + } catch (std::runtime_error const&) +#endif // JITIFY_ENABLE_EMBEDDED_FILES + { + // Try loading from filesystem + bool found_file = false; + if (search_current_dir) { + file_stream.open(fullpath.c_str()); + if (file_stream) { + 
source_stream = &file_stream; + found_file = true; + } + } + // Search include directories + if (!found_file) { + for (int i = 0; i < (int)include_paths.size(); ++i) { + fullpath = path_join(include_paths[i], filename); + file_stream.open(fullpath.c_str()); + if (file_stream) { + source_stream = &file_stream; + found_file = true; + break; + } + } + if (!found_file) { + // Try loading from builtin headers + fullpath = path_join("__jitify_builtin", filename); + auto it = get_jitsafe_headers_map().find(filename); + if (it != get_jitsafe_headers_map().end()) { + string_stream << it->second; + source_stream = &string_stream; + } else { + return false; + } + } + } + } + } + if (fullpaths) { + // Record the full file path corresponding to this include name. + (*fullpaths)[filename] = path_simplify(fullpath); + } + } + sources[filename] = std::string(); + std::string& source = sources[filename]; + std::string line; + size_t linenum = 0; + unsigned long long hash = 0; + bool pragma_once = false; + bool remove_next_blank_line = false; + while (std::getline(*source_stream, line)) { + ++linenum; + + // HACK WAR for static variables not allowed on the device (unless + // __shared__) + // TODO: This breaks static member variables + // line = replace_token(line, "static const", "/*static*/ const"); + + // TODO: Need to watch out for /* */ comments too + std::string cleanline = + line.substr(0, line.find("//")); // Strip line comments + // if( cleanline.back() == "\r" ) { // Remove Windows line ending + // cleanline = cleanline.substr(0, cleanline.size()-1); + //} + // TODO: Should trim whitespace before checking .empty() + if (cleanline.empty() && remove_next_blank_line) { + remove_next_blank_line = false; + continue; + } + // Maintain a file hash for use in #pragma once WAR + hash = hash_larson64(line.c_str(), hash); + if (cleanline.find("#pragma once") != std::string::npos) { + pragma_once = true; + // Note: This is an attempt to recover the original line numbering, + // which otherwise gets off-by-one due to the include guard. + remove_next_blank_line = true; + // line = "//" + line; // Comment out the #pragma once line + continue; + } + + // HACK WAR for Thrust using "#define FOO #pragma bar" + size_t pragma_beg = cleanline.find("#pragma "); + if (pragma_beg != std::string::npos) { + std::string line_after_pragma = line.substr(pragma_beg); + std::vector pragma_split = + split_string(line_after_pragma, 2); + line = + (line.substr(0, pragma_beg) + "_Pragma(\"" + pragma_split[1] + "\")"); + if (pragma_split.size() == 3) { + line += " " + pragma_split[2]; + } + } + + source += line + "\n"; + } + // HACK TESTING (WAR for cub) + // source = "#define cudaDeviceSynchronize() cudaSuccess\n" + source; + ////source = "cudaError_t cudaDeviceSynchronize() { return cudaSuccess; }\n" + + /// source; + + // WAR for #pragma once causing problems when there are multiple inclusions + // of the same header from different paths. 
+ if (pragma_once) { + std::stringstream ss; + ss << std::uppercase << std::hex << std::setw(8) << std::setfill('0') + << hash; + std::string include_guard_name = "_JITIFY_INCLUDE_GUARD_" + ss.str() + "\n"; + std::string include_guard_header; + include_guard_header += "#ifndef " + include_guard_name; + include_guard_header += "#define " + include_guard_name; + std::string include_guard_footer; + include_guard_footer += "#endif // " + include_guard_name; + source = include_guard_header + source + "\n" + include_guard_footer; + } + // return filename; + return true; +} + +} // namespace detail + +//! \endcond + +/*! Jitify reflection utilities namespace + */ +namespace reflection { + +// Provides type and value reflection via a function 'reflect': +// reflect() -> "Type" +// reflect(value) -> "(T)value" +// reflect() -> "VAL" +// reflect -> "VAL" +// reflect_template,char>() -> "" +// reflect_template({"float", "7", "char"}) -> "" + +/*! A wrapper class for non-type template parameters. + */ +template +struct NonType { + constexpr static T VALUE = VALUE_; +}; + +// Forward declaration +template +inline std::string reflect(T const& value); + +//! \cond + +namespace detail { + +template +inline std::string value_string(const T& x) { + std::stringstream ss; + ss << x; + return ss.str(); +} +// WAR for non-printable characters +template <> +inline std::string value_string(const char& x) { + std::stringstream ss; + ss << (int)x; + return ss.str(); +} +template <> +inline std::string value_string(const signed char& x) { + std::stringstream ss; + ss << (int)x; + return ss.str(); +} +template <> +inline std::string value_string(const unsigned char& x) { + std::stringstream ss; + ss << (int)x; + return ss.str(); +} +template <> +inline std::string value_string(const wchar_t& x) { + std::stringstream ss; + ss << (long)x; + return ss.str(); +} +// Specialisation for bool true/false literals +template <> +inline std::string value_string(const bool& x) { + return x ? "true" : "false"; +} + +// Removes all tokens that start with double underscores. +inline void strip_double_underscore_tokens(char* s) { + using jitify::detail::is_tokenchar; + char* w = s; + do { + if (*s == '_' && *(s + 1) == '_') { + while (is_tokenchar(*++s)) + ; + } + } while ((*w++ = *s++)); +} + +//#if CUDA_VERSION < 8000 +#ifdef _MSC_VER // MSVC compiler +inline std::string demangle_cuda_symbol(const char* mangled_name) { + // We don't have a way to demangle CUDA symbol names under MSVC. + return mangled_name; +} +inline std::string demangle_native_type(const std::type_info& typeinfo) { + // Get the decorated name and skip over the leading '.'. + const char* decorated_name = typeinfo.raw_name() + 1; + char undecorated_name[4096]; + if (UnDecorateSymbolName( + decorated_name, undecorated_name, + sizeof(undecorated_name) / sizeof(*undecorated_name), + UNDNAME_NO_ARGUMENTS | // Treat input as a type name + UNDNAME_NAME_ONLY // No "class" and "struct" prefixes + /*UNDNAME_NO_MS_KEYWORDS*/)) { // No "__cdecl", "__ptr64" etc. + // WAR for UNDNAME_NO_MS_KEYWORDS messing up function types. 
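+      // (Illustrative, hypothetical output) e.g. an undecorated name such as
+      // "int(__cdecl*)(int)" is reduced to "int(*)(int)" by stripping the
+      // double-underscore keyword tokens in the call below.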
+ strip_double_underscore_tokens(undecorated_name); + return undecorated_name; + } + throw std::runtime_error("UnDecorateSymbolName failed"); +} +#else // not MSVC +inline std::string demangle_cuda_symbol(const char* mangled_name) { + size_t bufsize = 0; + char* buf = nullptr; + std::string demangled_name; + int status; + auto demangled_ptr = std::unique_ptr( + abi::__cxa_demangle(mangled_name, buf, &bufsize, &status), free); + if (status == 0) { + demangled_name = demangled_ptr.get(); // all worked as expected + } else if (status == -2) { + demangled_name = mangled_name; // we interpret this as plain C name + } else if (status == -1) { + throw std::runtime_error( + std::string("memory allocation failure in __cxa_demangle")); + } else if (status == -3) { + throw std::runtime_error(std::string("invalid argument to __cxa_demangle")); + } + return demangled_name; +} +inline std::string demangle_native_type(const std::type_info& typeinfo) { + return demangle_cuda_symbol(typeinfo.name()); +} +#endif // not MSVC +//#endif // CUDA_VERSION < 8000 + +template +class JitifyTypeNameWrapper_ {}; + +template +struct type_reflection { + inline static std::string name() { + //#if CUDA_VERSION < 8000 + // TODO: Use nvrtcGetTypeName once it has the same behavior as this. + // WAR for typeid discarding cv qualifiers on value-types + // Wrap type in dummy template class to preserve cv-qualifiers, then strip + // off the wrapper from the resulting string. + std::string wrapped_name = + demangle_native_type(typeid(JitifyTypeNameWrapper_)); + // Note: The reflected name of this class also has namespace prefixes. + const std::string wrapper_class_name = "JitifyTypeNameWrapper_<"; + size_t start = wrapped_name.find(wrapper_class_name); + if (start == std::string::npos) { + throw std::runtime_error("Type reflection failed: " + wrapped_name); + } + start += wrapper_class_name.size(); + std::string name = + wrapped_name.substr(start, wrapped_name.size() - (start + 1)); + return name; + //#else + // std::string ret; + // nvrtcResult status = nvrtcGetTypeName(&ret); + // if( status != NVRTC_SUCCESS ) { + // throw std::runtime_error(std::string("nvrtcGetTypeName + // failed: + //")+ nvrtcGetErrorString(status)); + // } + // return ret; + //#endif + } +}; // namespace detail +template +struct type_reflection > { + inline static std::string name() { + return jitify::reflection::reflect(VALUE); + } +}; + +} // namespace detail + +//! \endcond + +/*! Create an Instance object that contains a const reference to the + * value. We use this to wrap abstract objects from which we want to extract + * their type at runtime (e.g., derived type). This is used to facilitate + * templating on derived type when all we know at compile time is abstract + * type. + */ +template +struct Instance { + const T& value; + Instance(const T& value) : value(value) {} +}; + +/*! Create an Instance object from which we can extract the value's run-time + * type. + * \param value The const value to be captured. + */ +template +inline Instance instance_of(T const& value) { + return Instance(value); +} + +/*! A wrapper used for representing types as values. + */ +template +struct Type {}; + +// Type reflection +// E.g., reflect() -> "float" +// Note: This strips trailing const and volatile qualifiers +/*! Generate a code-string for a type. + * \code{.cpp}reflect() --> "float"\endcode + */ +template +inline std::string reflect() { + return detail::type_reflection::name(); +} +// Value reflection +// E.g., reflect(3.14f) -> "(float)3.14" +/*! 
Generate a code-string for a value. + * \code{.cpp}reflect(3.14f) --> "(float)3.14"\endcode + */ +template +inline std::string reflect(T const& value) { + return "(" + reflect() + ")" + detail::value_string(value); +} +// Non-type template arg reflection (implicit conversion to int64_t) +// E.g., reflect<7>() -> "(int64_t)7" +/*! Generate a code-string for an integer non-type template argument. + * \code{.cpp}reflect<7>() --> "(int64_t)7"\endcode + */ +template +inline std::string reflect() { + return reflect >(); +} +// Non-type template arg reflection (explicit type) +// E.g., reflect() -> "(int)7" +/*! Generate a code-string for a generic non-type template argument. + * \code{.cpp} reflect() --> "(int)7" \endcode + */ +template +inline std::string reflect() { + return reflect >(); +} +// Type reflection via value +// E.g., reflect(Type()) -> "float" +/*! Generate a code-string for a type wrapped as a Type instance. + * \code{.cpp}reflect(Type()) --> "float"\endcode + */ +template +inline std::string reflect(jitify::reflection::Type) { + return reflect(); +} + +/*! Generate a code-string for a type wrapped as an Instance instance. + * \code{.cpp}reflect(Instance(3.1f)) --> "float"\endcode + * or more simply when passed to a instance_of helper + * \code{.cpp}reflect(instance_of(3.1f)) --> "float"\endcodei + * This is specifically for the case where we want to extract the run-time + * type, e.g., derived type, of an object pointer. + */ +template +inline std::string reflect(jitify::reflection::Instance& value) { + return detail::demangle_native_type(typeid(value.value)); +} + +// Type from value +// E.g., type_of(3.14f) -> Type() +/*! Create a Type object representing a value's type. + * \param value The value whose type is to be captured. + */ +template +inline Type type_of(T& value) { + return Type(); +} +/*! Create a Type object representing a value's type. + * \param value The const value whose type is to be captured. + */ +template +inline Type type_of(T const& value) { + return Type(); +} + +// Multiple value reflections one call, returning list of strings +template +inline std::vector reflect_all(Args... args) { + return {reflect(args)...}; +} + +inline std::string reflect_list(jitify::detail::vector const& args, + std::string opener = "", + std::string closer = "") { + std::stringstream ss; + ss << opener; + for (int i = 0; i < (int)args.size(); ++i) { + if (i > 0) ss << ","; + ss << args[i]; + } + ss << closer; + return ss.str(); +} + +// Template instantiation reflection +// inline std::string reflect_template(std::vector const& args) { +inline std::string reflect_template( + jitify::detail::vector const& args) { + // Note: The space in " >" is a WAR to avoid '>>' appearing + return reflect_list(args, "<", " >"); +} +// TODO: See if can make this evaluate completely at compile-time +template +inline std::string reflect_template() { + return reflect_template({reflect()...}); + // return reflect_template({reflect()...}); +} + +} // namespace reflection + +//! \cond + +namespace detail { + +// Demangles nested variable names using the PTX name mangling scheme +// (which follows the Itanium64 ABI). E.g., _ZN1a3Foo2bcE -> a::Foo::bc. +inline std::string demangle_ptx_variable_name(const char* name) { + std::stringstream ss; + const char* c = name; + if (*c++ != '_' || *c++ != 'Z') return name; // Non-mangled name + if (*c++ != 'N') return ""; // Not a nested name, unsupported + while (true) { + // Parse identifier length. 
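+    // Worked example (matching the comment above): for "_ZN1a3Foo2bcE" this
+    // loop reads length/identifier pairs 1:"a", 3:"Foo", 2:"bc", stops at the
+    // trailing 'E', and produces "a::Foo::bc".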
+ int n = 0; + while (std::isdigit(*c)) { + n = n * 10 + (*c - '0'); + c++; + } + if (!n) return ""; // Invalid or unsupported mangled name + // Parse identifier. + const char* c0 = c; + while (n-- && *c) c++; + if (!*c) return ""; // Mangled name is truncated + std::string id(c0, c); + // Identifiers starting with "_GLOBAL" are anonymous namespaces. + ss << (id.substr(0, 7) == "_GLOBAL" ? "(anonymous namespace)" : id); + // Nested name specifiers end with 'E'. + if (*c == 'E') break; + // There are more identifiers to come, add join token. + ss << "::"; + } + return ss.str(); +} + +static const char* get_current_executable_path() { + static const char* path = []() -> const char* { + static char buffer[JITIFY_PATH_MAX] = {}; +#ifdef __linux__ + if (!::realpath("/proc/self/exe", buffer)) return nullptr; +#elif defined(_WIN32) || defined(_WIN64) + if (!GetModuleFileNameA(nullptr, buffer, JITIFY_PATH_MAX)) return nullptr; +#endif + return buffer; + }(); + return path; +} + +inline bool endswith(const std::string& str, const std::string& suffix) { + return str.size() >= suffix.size() && + str.substr(str.size() - suffix.size()) == suffix; +} + +// Infers the JIT input type from the filename suffix. If no known suffix is +// present, the filename is assumed to refer to a library, and the associated +// suffix (and possibly prefix) is automatically added to the filename. +inline CUjitInputType get_cuda_jit_input_type(std::string* filename) { + if (endswith(*filename, ".ptx")) { + return CU_JIT_INPUT_PTX; + } else if (endswith(*filename, ".cubin")) { + return CU_JIT_INPUT_CUBIN; + } else if (endswith(*filename, ".fatbin")) { + return CU_JIT_INPUT_FATBINARY; + } else if (endswith(*filename, +#if defined _WIN32 || defined _WIN64 + ".obj" +#else // Linux + ".o" +#endif + )) { + return CU_JIT_INPUT_OBJECT; + } else { // Assume library +#if defined _WIN32 || defined _WIN64 + if (!endswith(*filename, ".lib")) { + *filename += ".lib"; + } +#else // Linux + if (!endswith(*filename, ".a")) { + *filename = "lib" + *filename + ".a"; + } +#endif + return CU_JIT_INPUT_LIBRARY; + } +} + +class CUDAKernel { + std::vector _link_files; + std::vector _link_paths; + CUlinkState _link_state; + CUmodule _module; + CUfunction _kernel; + std::string _func_name; + std::string _ptx; + std::map _global_map; + std::vector _opts; + std::vector _optvals; +#ifdef JITIFY_PRINT_LINKER_LOG + static const unsigned int _log_size = 8192; + char _error_log[_log_size]; + char _info_log[_log_size]; +#endif + + inline void cuda_safe_call(CUresult res) const { + if (res != CUDA_SUCCESS) { + const char* msg; + cuGetErrorName(res, &msg); + throw std::runtime_error(msg); + } + } + inline void create_module(std::vector link_files, + std::vector link_paths) { + CUresult result; +#ifndef JITIFY_PRINT_LINKER_LOG + // WAR since linker log does not seem to be constructed using a single call + // to cuModuleLoadDataEx. + if (link_files.empty()) { + result = + cuModuleLoadDataEx(&_module, _ptx.c_str(), (unsigned)_opts.size(), + _opts.data(), _optvals.data()); + } else +#endif + { + cuda_safe_call(cuLinkCreate((unsigned)_opts.size(), _opts.data(), + _optvals.data(), &_link_state)); + cuda_safe_call(cuLinkAddData(_link_state, CU_JIT_INPUT_PTX, + (void*)_ptx.c_str(), _ptx.size(), + "jitified_source.ptx", 0, 0, 0)); + for (int i = 0; i < (int)link_files.size(); ++i) { + std::string link_file = link_files[i]; + CUjitInputType jit_input_type; + if (link_file == ".") { + // Special case for linking to current executable. 
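+          // (Illustrative) a "-l." option yields link_files = {"."} and links
+          // against the running executable itself; a plain name such as
+          // "cudadevrt" would instead be resolved below to "libcudadevrt.a"
+          // on Linux (see get_cuda_jit_input_type above).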
+ link_file = get_current_executable_path(); + jit_input_type = CU_JIT_INPUT_OBJECT; + } else { + // Infer based on filename. + jit_input_type = get_cuda_jit_input_type(&link_file); + } + CUresult result = cuLinkAddFile(_link_state, jit_input_type, + link_file.c_str(), 0, 0, 0); + int path_num = 0; + while (result == CUDA_ERROR_FILE_NOT_FOUND && + path_num < (int)link_paths.size()) { + std::string filename = path_join(link_paths[path_num++], link_file); + result = cuLinkAddFile(_link_state, jit_input_type, filename.c_str(), + 0, 0, 0); + } +#if JITIFY_PRINT_LINKER_LOG + if (result == CUDA_ERROR_FILE_NOT_FOUND) { + std::cerr << "Linker error: Device library not found: " << link_file + << std::endl; + } else if (result != CUDA_SUCCESS) { + std::cerr << "Linker error: Failed to add file: " << link_file + << std::endl; + std::cerr << _error_log << std::endl; + } +#endif + cuda_safe_call(result); + } + size_t cubin_size; + void* cubin; + result = cuLinkComplete(_link_state, &cubin, &cubin_size); + if (result == CUDA_SUCCESS) { + result = cuModuleLoadData(&_module, cubin); + } + } +#ifdef JITIFY_PRINT_LINKER_LOG + std::cout << "---------------------------------------" << std::endl; + std::cout << "--- Linker for " + << reflection::detail::demangle_cuda_symbol(_func_name.c_str()) + << " ---" << std::endl; + std::cout << "---------------------------------------" << std::endl; + std::cout << _info_log << std::endl; + std::cout << std::endl; + std::cout << _error_log << std::endl; + std::cout << "---------------------------------------" << std::endl; +#endif + cuda_safe_call(result); + // Allow _func_name to be empty to support cases where we want to generate + // PTX containing extern symbol definitions but no kernels. + if (!_func_name.empty()) { + cuda_safe_call( + cuModuleGetFunction(&_kernel, _module, _func_name.c_str())); + } + } + inline void destroy_module() { + if (_link_state) { + cuda_safe_call(cuLinkDestroy(_link_state)); + } + _link_state = 0; + if (_module) { + cuModuleUnload(_module); + } + _module = 0; + } + + // create a map of __constant__ and __device__ variables in the ptx file + // mapping demangled to mangled name + inline void create_global_variable_map() { + size_t pos = 0; + while (pos < _ptx.size()) { + pos = std::min(_ptx.find(".const .align", pos), + _ptx.find(".global .align", pos)); + if (pos == std::string::npos) break; + size_t end = _ptx.find_first_of(";=", pos); + if (_ptx[end] == '=') --end; + std::string line = _ptx.substr(pos, end - pos); + pos = end; + size_t symbol_start = line.find_last_of(" ") + 1; + size_t symbol_end = line.find_last_of("["); + std::string entry = line.substr(symbol_start, symbol_end - symbol_start); + std::string key = detail::demangle_ptx_variable_name(entry.c_str()); + // Skip unsupported mangled names. E.g., a static variable defined inside + // a function (such variables are not directly addressable from outside + // the function, so skipping them is the correct behavior). 
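+      // (Illustrative PTX, hypothetical symbol) a declaration such as
+      //   .global .align 4 .b32 _ZN5Scope3varE;
+      // yields the map entry  "Scope::var" -> "_ZN5Scope3varE".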
+ if (key == "") continue; + _global_map[key] = entry; + } + } + + inline void set_linker_log() { +#ifdef JITIFY_PRINT_LINKER_LOG + _opts.push_back(CU_JIT_INFO_LOG_BUFFER); + _optvals.push_back((void*)_info_log); + _opts.push_back(CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES); + _optvals.push_back((void*)(long)_log_size); + _opts.push_back(CU_JIT_ERROR_LOG_BUFFER); + _optvals.push_back((void*)_error_log); + _opts.push_back(CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES); + _optvals.push_back((void*)(long)_log_size); + _opts.push_back(CU_JIT_LOG_VERBOSE); + _optvals.push_back((void*)1); +#endif + } + + public: + inline CUDAKernel() : _link_state(0), _module(0), _kernel(0) {} + inline CUDAKernel(const CUDAKernel& other) = delete; + inline CUDAKernel& operator=(const CUDAKernel& other) = delete; + inline CUDAKernel(CUDAKernel&& other) = delete; + inline CUDAKernel& operator=(CUDAKernel&& other) = delete; + inline CUDAKernel(const char* func_name, const char* ptx, + std::vector link_files, + std::vector link_paths, unsigned int nopts = 0, + CUjit_option* opts = 0, void** optvals = 0) + : _link_files(link_files), + _link_paths(link_paths), + _link_state(0), + _module(0), + _kernel(0), + _func_name(func_name), + _ptx(ptx), + _opts(opts, opts + nopts), + _optvals(optvals, optvals + nopts) { + this->set_linker_log(); + this->create_module(link_files, link_paths); + this->create_global_variable_map(); + } + + inline CUDAKernel& set(const char* func_name, const char* ptx, + std::vector link_files, + std::vector link_paths, + unsigned int nopts = 0, CUjit_option* opts = 0, + void** optvals = 0) { + this->destroy_module(); + _func_name = func_name; + _ptx = ptx; + _link_files = link_files; + _link_paths = link_paths; + _opts.assign(opts, opts + nopts); + _optvals.assign(optvals, optvals + nopts); + this->set_linker_log(); + this->create_module(link_files, link_paths); + this->create_global_variable_map(); + return *this; + } + inline ~CUDAKernel() { this->destroy_module(); } + inline operator CUfunction() const { return _kernel; } + + inline CUresult launch(dim3 grid, dim3 block, unsigned int smem, + CUstream stream, std::vector arg_ptrs) const { + return cuLaunchKernel(_kernel, grid.x, grid.y, grid.z, block.x, block.y, + block.z, smem, stream, arg_ptrs.data(), NULL); + } + + inline CUdeviceptr get_global_ptr(const char* name, + size_t* size = nullptr) const { + CUdeviceptr global_ptr = 0; + auto global = _global_map.find(name); + if (global != _global_map.end()) { + cuda_safe_call(cuModuleGetGlobal(&global_ptr, size, _module, + global->second.c_str())); + } else { + throw std::runtime_error(std::string("failed to look up global ") + name); + } + return global_ptr; + } + + template + inline CUresult get_global_data(const char* name, T* data, size_t count, + CUstream stream = 0) const { + size_t size_bytes; + CUdeviceptr ptr = get_global_ptr(name, &size_bytes); + size_t given_size_bytes = count * sizeof(T); + if (given_size_bytes != size_bytes) { + throw std::runtime_error( + std::string("Value for global variable ") + name + + " has wrong size: got " + std::to_string(given_size_bytes) + + " bytes, expected " + std::to_string(size_bytes)); + } + return cuMemcpyDtoH(data, ptr, size_bytes); + } + + template + inline CUresult set_global_data(const char* name, const T* data, size_t count, + CUstream stream = 0) const { + size_t size_bytes; + CUdeviceptr ptr = get_global_ptr(name, &size_bytes); + size_t given_size_bytes = count * sizeof(T); + if (given_size_bytes != size_bytes) { + throw std::runtime_error( + std::string("Value 
for global variable ") + name + + " has wrong size: got " + std::to_string(given_size_bytes) + + " bytes, expected " + std::to_string(size_bytes)); + } + return cuMemcpyHtoD(ptr, data, size_bytes); + } + + const std::string& function_name() const { return _func_name; } + const std::string& ptx() const { return _ptx; } + const std::vector& link_files() const { return _link_files; } + const std::vector& link_paths() const { return _link_paths; } +}; + +static const char* jitsafe_header_preinclude_h = R"( +//// WAR for Thrust (which appears to have forgotten to include this in result_of_adaptable_function.h +//#include + +//// WAR for Thrust (which appear to have forgotten to include this in error_code.h) +//#include + +// WAR for Thrust (which only supports gnuc, clang or msvc) +#define __GNUC__ 4 + +// WAR for generics/shfl.h +#define THRUST_STATIC_ASSERT(x) + +// WAR for CUB +#ifdef __host__ +#undef __host__ +#endif +#define __host__ + +// WAR to allow exceptions to be parsed +#define try +#define catch(...) +)"; + + +static const char* jitsafe_header_float_h = R"( +#pragma once + +#define FLT_RADIX 2 +#define FLT_MANT_DIG 24 +#define DBL_MANT_DIG 53 +#define FLT_DIG 6 +#define DBL_DIG 15 +#define FLT_MIN_EXP -125 +#define DBL_MIN_EXP -1021 +#define FLT_MIN_10_EXP -37 +#define DBL_MIN_10_EXP -307 +#define FLT_MAX_EXP 128 +#define DBL_MAX_EXP 1024 +#define FLT_MAX_10_EXP 38 +#define DBL_MAX_10_EXP 308 +#define FLT_MAX 3.4028234e38f +#define DBL_MAX 1.7976931348623157e308 +#define FLT_EPSILON 1.19209289e-7f +#define DBL_EPSILON 2.220440492503130e-16 +#define FLT_MIN 1.1754943e-38f; +#define DBL_MIN 2.2250738585072013e-308 +#define FLT_ROUNDS 1 +#if defined __cplusplus && __cplusplus >= 201103L +#define FLT_EVAL_METHOD 0 +#define DECIMAL_DIG 21 +#endif +)"; + +static const char* jitsafe_header_limits_h = R"( +#pragma once + +#if defined _WIN32 || defined _WIN64 + #define __WORDSIZE 32 +#else + #if defined __x86_64__ && !defined __ILP32__ + #define __WORDSIZE 64 + #else + #define __WORDSIZE 32 + #endif +#endif +#define MB_LEN_MAX 16 +#define CHAR_BIT 8 +#define SCHAR_MIN (-128) +#define SCHAR_MAX 127 +#define UCHAR_MAX 255 +enum { + _JITIFY_CHAR_IS_UNSIGNED = (char)-1 >= 0, + CHAR_MIN = _JITIFY_CHAR_IS_UNSIGNED ? 0 : SCHAR_MIN, + CHAR_MAX = _JITIFY_CHAR_IS_UNSIGNED ? 
UCHAR_MAX : SCHAR_MAX, +}; +#define SHRT_MIN (-32768) +#define SHRT_MAX 32767 +#define USHRT_MAX 65535 +#define INT_MIN (-INT_MAX - 1) +#define INT_MAX 2147483647 +#define UINT_MAX 4294967295U +#if __WORDSIZE == 64 + # define LONG_MAX 9223372036854775807L +#else + # define LONG_MAX 2147483647L +#endif +#define LONG_MIN (-LONG_MAX - 1L) +#if __WORDSIZE == 64 + #define ULONG_MAX 18446744073709551615UL +#else + #define ULONG_MAX 4294967295UL +#endif +#define LLONG_MAX 9223372036854775807LL +#define LLONG_MIN (-LLONG_MAX - 1LL) +#define ULLONG_MAX 18446744073709551615ULL +)"; + +static const char* jitsafe_header_iterator = R"( +#pragma once + +namespace __jitify_iterator_ns { +struct output_iterator_tag {}; +struct input_iterator_tag {}; +struct forward_iterator_tag {}; +struct bidirectional_iterator_tag {}; +struct random_access_iterator_tag {}; +template +struct iterator_traits { + typedef typename Iterator::iterator_category iterator_category; + typedef typename Iterator::value_type value_type; + typedef typename Iterator::difference_type difference_type; + typedef typename Iterator::pointer pointer; + typedef typename Iterator::reference reference; +}; +template +struct iterator_traits { + typedef random_access_iterator_tag iterator_category; + typedef T value_type; + typedef ptrdiff_t difference_type; + typedef T* pointer; + typedef T& reference; +}; +template +struct iterator_traits { + typedef random_access_iterator_tag iterator_category; + typedef T value_type; + typedef ptrdiff_t difference_type; + typedef T const* pointer; + typedef T const& reference; +}; +} // namespace __jitify_iterator_ns +namespace std { using namespace __jitify_iterator_ns; } +using namespace __jitify_iterator_ns; +)"; + +// TODO: This is incomplete; need floating point limits +// Joe Eaton: added IEEE float and double types, none of the smaller types +// using type specific structs since we can't template on floats. 
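+// Illustrative sketch (not part of jitify): these jitsafe strings are served
+// by load_source() via get_jitsafe_headers_map() (declared above) whenever
+// JIT-compiled code does e.g. `#include <limits>`, instead of reading the
+// host's system header. The snippet is an example only and kept out of the
+// build:
+#if 0
+  std::map<std::string, std::string> sources;
+  // Resolves "limits" through the builtin jitsafe header map.
+  bool ok = load_source("limits", sources);
+  // On success, sources["limits"] holds the jitsafe <limits> replacement text.
+#endif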
+static const char* jitsafe_header_limits = R"( +#pragma once +#include +#include +// TODO: epsilon(), infinity(), etc +namespace __jitify_detail { +#if __cplusplus >= 201103L +#define JITIFY_CXX11_CONSTEXPR constexpr +#define JITIFY_CXX11_NOEXCEPT noexcept +#else +#define JITIFY_CXX11_CONSTEXPR +#define JITIFY_CXX11_NOEXCEPT +#endif + +struct FloatLimits { +#if __cplusplus >= 201103L + static JITIFY_CXX11_CONSTEXPR inline __host__ __device__ + float lowest() JITIFY_CXX11_NOEXCEPT { return -FLT_MAX;} + static JITIFY_CXX11_CONSTEXPR inline __host__ __device__ + float min() JITIFY_CXX11_NOEXCEPT { return FLT_MIN; } + static JITIFY_CXX11_CONSTEXPR inline __host__ __device__ + float max() JITIFY_CXX11_NOEXCEPT { return FLT_MAX; } +#endif // __cplusplus >= 201103L + enum { + is_specialized = true, + is_signed = true, + is_integer = false, + is_exact = false, + has_infinity = true, + has_quiet_NaN = true, + has_signaling_NaN = true, + has_denorm = 1, + has_denorm_loss = true, + round_style = 1, + is_iec559 = true, + is_bounded = true, + is_modulo = false, + digits = 24, + digits10 = 6, + max_digits10 = 9, + radix = 2, + min_exponent = -125, + min_exponent10 = -37, + max_exponent = 128, + max_exponent10 = 38, + tinyness_before = false, + traps = false + }; +}; +struct DoubleLimits { +#if __cplusplus >= 201103L + static JITIFY_CXX11_CONSTEXPR inline __host__ __device__ + double lowest() noexcept { return -DBL_MAX; } + static JITIFY_CXX11_CONSTEXPR inline __host__ __device__ + double min() noexcept { return DBL_MIN; } + static JITIFY_CXX11_CONSTEXPR inline __host__ __device__ + double max() noexcept { return DBL_MAX; } +#endif // __cplusplus >= 201103L + enum { + is_specialized = true, + is_signed = true, + is_integer = false, + is_exact = false, + has_infinity = true, + has_quiet_NaN = true, + has_signaling_NaN = true, + has_denorm = 1, + has_denorm_loss = true, + round_style = 1, + is_iec559 = true, + is_bounded = true, + is_modulo = false, + digits = 53, + digits10 = 15, + max_digits10 = 17, + radix = 2, + min_exponent = -1021, + min_exponent10 = -307, + max_exponent = 1024, + max_exponent10 = 308, + tinyness_before = false, + traps = false + }; +}; +template +struct IntegerLimits { + static inline __host__ __device__ T min() { return Min; } + static inline __host__ __device__ T max() { return Max; } +#if __cplusplus >= 201103L + static constexpr inline __host__ __device__ T lowest() noexcept { + return Min; + } +#endif // __cplusplus >= 201103L + enum { + is_specialized = true, + digits = (Digits == -1) ? 
(int)(sizeof(T)*8 - (Min != 0)) : Digits, + digits10 = (digits * 30103) / 100000, + is_signed = ((T)(-1)<0), + is_integer = true, + is_exact = true, + radix = 2, + is_bounded = true, + is_modulo = false + }; +}; +} // namespace __jitify_detail +namespace std { using namespace __jitify_detail; } +namespace __jitify_limits_ns { +template struct numeric_limits { + enum { is_specialized = false }; +}; +template<> struct numeric_limits : public +__jitify_detail::IntegerLimits {}; +template<> struct numeric_limits : public +__jitify_detail::IntegerLimits +{}; +template<> struct numeric_limits : public +__jitify_detail::IntegerLimits +{}; +template<> struct numeric_limits : public +__jitify_detail::IntegerLimits +{}; +template<> struct numeric_limits : public +__jitify_detail::IntegerLimits {}; +template<> struct numeric_limits : public +__jitify_detail::IntegerLimits +{}; +template<> struct numeric_limits : public +__jitify_detail::IntegerLimits +{}; +template<> struct numeric_limits : public +__jitify_detail::IntegerLimits {}; +template<> struct numeric_limits : public +__jitify_detail::IntegerLimits +{}; +template<> struct numeric_limits : public +__jitify_detail::IntegerLimits +{}; +template<> struct numeric_limits : public +__jitify_detail::IntegerLimits +{}; +template<> struct numeric_limits : public +__jitify_detail::IntegerLimits +{}; +template<> struct numeric_limits : public +__jitify_detail::IntegerLimits +{}; +//template struct numeric_limits { static const bool +//is_signed = ((T)(-1)<0); }; +template<> struct numeric_limits : public +__jitify_detail::FloatLimits +{}; +template<> struct numeric_limits : public +__jitify_detail::DoubleLimits +{}; +} // namespace __jitify_limits_ns +namespace std { using namespace __jitify_limits_ns; } +using namespace __jitify_limits_ns; +)"; + +// TODO: This is highly incomplete +static const char* jitsafe_header_type_traits = R"( + #pragma once + #if __cplusplus >= 201103L + namespace __jitify_type_traits_ns { + + template struct enable_if {}; + template struct enable_if { typedef T type; }; + #if __cplusplus >= 201402L + template< bool B, class T = void > using enable_if_t = typename enable_if::type; + #endif + + struct true_type { + enum { value = true }; + operator bool() const { return true; } + }; + struct false_type { + enum { value = false }; + operator bool() const { return false; } + }; + + template struct is_floating_point : false_type {}; + template<> struct is_floating_point : true_type {}; + template<> struct is_floating_point : true_type {}; + template<> struct is_floating_point : true_type {}; + + template struct is_integral : false_type {}; + template<> struct is_integral : true_type {}; + template<> struct is_integral : true_type {}; + template<> struct is_integral : true_type {}; + template<> struct is_integral : true_type {}; + template<> struct is_integral : true_type {}; + template<> struct is_integral : true_type {}; + template<> struct is_integral : true_type {}; + template<> struct is_integral : true_type {}; + template<> struct is_integral : true_type {}; + template<> struct is_integral : true_type {}; + template<> struct is_integral : true_type {}; + template<> struct is_integral : true_type {}; + + template struct is_signed : false_type {}; + template<> struct is_signed : true_type {}; + template<> struct is_signed : true_type {}; + template<> struct is_signed : true_type {}; + template<> struct is_signed : true_type {}; + template<> struct is_signed : true_type {}; + template<> struct is_signed : true_type {}; + 
template<> struct is_signed : true_type {}; + template<> struct is_signed : true_type {}; + + template struct is_unsigned : false_type {}; + template<> struct is_unsigned : true_type {}; + template<> struct is_unsigned : true_type {}; + template<> struct is_unsigned : true_type {}; + template<> struct is_unsigned : true_type {}; + template<> struct is_unsigned : true_type {}; + + template struct is_same : false_type {}; + template struct is_same : true_type {}; + + template struct is_array : false_type {}; + template struct is_array : true_type {}; + template struct is_array : true_type {}; + + //partial implementation only of is_function + template struct is_function : false_type { }; + template struct is_function : true_type {}; //regular + template struct is_function : true_type {}; // variadic + + template struct result_of; + template + struct result_of { + // TODO: This is a hack; a proper implem is quite complicated. + typedef typename F::result_type type; + }; + + template struct remove_reference { typedef T type; }; + template struct remove_reference { typedef T type; }; + template struct remove_reference { typedef T type; }; + #if __cplusplus >= 201402L + template< class T > using remove_reference_t = typename remove_reference::type; + #endif + + template struct remove_extent { typedef T type; }; + template struct remove_extent { typedef T type; }; + template struct remove_extent { typedef T type; }; + #if __cplusplus >= 201402L + template< class T > using remove_extent_t = typename remove_extent::type; + #endif + + template< class T > struct remove_const { typedef T type; }; + template< class T > struct remove_const { typedef T type; }; + template< class T > struct remove_volatile { typedef T type; }; + template< class T > struct remove_volatile { typedef T type; }; + template< class T > struct remove_cv { typedef typename remove_volatile::type>::type type; }; + #if __cplusplus >= 201402L + template< class T > using remove_cv_t = typename remove_cv::type; + template< class T > using remove_const_t = typename remove_const::type; + template< class T > using remove_volatile_t = typename remove_volatile::type; + #endif + + template struct conditional { typedef T type; }; + template struct conditional { typedef F type; }; + #if __cplusplus >= 201402L + template< bool B, class T, class F > using conditional_t = typename conditional::type; + #endif + + namespace __jitify_detail { + template< class T, bool is_function_type = false > struct add_pointer { using type = typename remove_reference::type*; }; + template< class T > struct add_pointer { using type = T; }; + template< class T, class... Args > struct add_pointer { using type = T(*)(Args...); }; + template< class T, class... 
Args > struct add_pointer { using type = T(*)(Args..., ...); }; + } + template< class T > struct add_pointer : __jitify_detail::add_pointer::value> {}; + #if __cplusplus >= 201402L + template< class T > using add_pointer_t = typename add_pointer::type; + #endif + + template< class T > struct decay { + private: + typedef typename remove_reference::type U; + public: + typedef typename conditional::value, typename remove_extent::type*, + typename conditional::value,typename add_pointer::type,typename remove_cv::type + >::type>::type type; + }; + #if __cplusplus >= 201402L + template< class T > using decay_t = typename decay::type; + #endif + + } // namespace __jtiify_type_traits_ns + namespace std { using namespace __jitify_type_traits_ns; } + using namespace __jitify_type_traits_ns; + #endif // c++11 +)"; + +// TODO: INT_FAST8_MAX et al. and a few other misc constants +static const char* jitsafe_header_stdint_h = + "#pragma once\n" + "#include \n" + "namespace __jitify_stdint_ns {\n" + "typedef signed char int8_t;\n" + "typedef signed short int16_t;\n" + "typedef signed int int32_t;\n" + "typedef signed long long int64_t;\n" + "typedef signed char int_fast8_t;\n" + "typedef signed short int_fast16_t;\n" + "typedef signed int int_fast32_t;\n" + "typedef signed long long int_fast64_t;\n" + "typedef signed char int_least8_t;\n" + "typedef signed short int_least16_t;\n" + "typedef signed int int_least32_t;\n" + "typedef signed long long int_least64_t;\n" + "typedef signed long long intmax_t;\n" + "typedef signed long intptr_t; //optional\n" + "typedef unsigned char uint8_t;\n" + "typedef unsigned short uint16_t;\n" + "typedef unsigned int uint32_t;\n" + "typedef unsigned long long uint64_t;\n" + "typedef unsigned char uint_fast8_t;\n" + "typedef unsigned short uint_fast16_t;\n" + "typedef unsigned int uint_fast32_t;\n" + "typedef unsigned long long uint_fast64_t;\n" + "typedef unsigned char uint_least8_t;\n" + "typedef unsigned short uint_least16_t;\n" + "typedef unsigned int uint_least32_t;\n" + "typedef unsigned long long uint_least64_t;\n" + "typedef unsigned long long uintmax_t;\n" + "typedef unsigned long uintptr_t; //optional\n" + "#define INT8_MIN SCHAR_MIN\n" + "#define INT16_MIN SHRT_MIN\n" + "#define INT32_MIN INT_MIN\n" + "#define INT64_MIN LLONG_MIN\n" + "#define INT8_MAX SCHAR_MAX\n" + "#define INT16_MAX SHRT_MAX\n" + "#define INT32_MAX INT_MAX\n" + "#define INT64_MAX LLONG_MAX\n" + "#define UINT8_MAX UCHAR_MAX\n" + "#define UINT16_MAX USHRT_MAX\n" + "#define UINT32_MAX UINT_MAX\n" + "#define UINT64_MAX ULLONG_MAX\n" + "#define INTPTR_MIN LONG_MIN\n" + "#define INTMAX_MIN LLONG_MIN\n" + "#define INTPTR_MAX LONG_MAX\n" + "#define INTMAX_MAX LLONG_MAX\n" + "#define UINTPTR_MAX ULONG_MAX\n" + "#define UINTMAX_MAX ULLONG_MAX\n" + "#define PTRDIFF_MIN INTPTR_MIN\n" + "#define PTRDIFF_MAX INTPTR_MAX\n" + "#define SIZE_MAX UINT64_MAX\n" + "} // namespace __jitify_stdint_ns\n" + "namespace std { using namespace __jitify_stdint_ns; }\n" + "using namespace __jitify_stdint_ns;\n"; + +// TODO: offsetof +static const char* jitsafe_header_stddef_h = + "#pragma once\n" + "#include \n" + "namespace __jitify_stddef_ns {\n" + "#if __cplusplus >= 201103L\n" + "typedef decltype(nullptr) nullptr_t;\n" + "#if defined(_MSC_VER)\n" + " typedef double max_align_t;\n" + "#elif defined(__APPLE__)\n" + " typedef long double max_align_t;\n" + "#else\n" + " // Define max_align_t to match the GCC definition.\n" + " typedef struct {\n" + " long long __jitify_max_align_nonce1\n" + " 
__attribute__((__aligned__(__alignof__(long long))));\n" + " long double __jitify_max_align_nonce2\n" + " __attribute__((__aligned__(__alignof__(long double))));\n" + " } max_align_t;\n" + "#endif\n" + "#endif // __cplusplus >= 201103L\n" + "#if __cplusplus >= 201703L\n" + "enum class byte : unsigned char {};\n" + "#endif // __cplusplus >= 201703L\n" + "} // namespace __jitify_stddef_ns\n" + "namespace std {\n" + " // NVRTC provides built-in definitions of ::size_t and ::ptrdiff_t.\n" + " using ::size_t;\n" + " using ::ptrdiff_t;\n" + " using namespace __jitify_stddef_ns;\n" + "} // namespace std\n" + "using namespace __jitify_stddef_ns;\n"; + +static const char* jitsafe_header_stdlib_h = + "#pragma once\n" + "#include \n"; +static const char* jitsafe_header_stdio_h = + "#pragma once\n" + "#include \n" + "#define FILE int\n" + "int fflush ( FILE * stream );\n" + "int fprintf ( FILE * stream, const char * format, ... );\n"; + +static const char* jitsafe_header_string_h = + "#pragma once\n" + "char* strcpy ( char * destination, const char * source );\n" + "int strcmp ( const char * str1, const char * str2 );\n" + "char* strerror( int errnum );\n"; + +static const char* jitsafe_header_cstring = + "#pragma once\n" + "\n" + "namespace __jitify_cstring_ns {\n" + "char* strcpy ( char * destination, const char * source );\n" + "int strcmp ( const char * str1, const char * str2 );\n" + "char* strerror( int errnum );\n" + "} // namespace __jitify_cstring_ns\n" + "namespace std { using namespace __jitify_cstring_ns; }\n" + "using namespace __jitify_cstring_ns;\n"; + +// HACK TESTING (WAR for cub) +static const char* jitsafe_header_iostream = + "#pragma once\n" + "#include \n" + "#include \n"; +// HACK TESTING (WAR for Thrust) +static const char* jitsafe_header_ostream = + "#pragma once\n" + "\n" + "namespace __jitify_ostream_ns {\n" + "template\n" // = std::char_traits + // >\n" + "struct basic_ostream {\n" + "};\n" + "typedef basic_ostream ostream;\n" + "ostream& endl(ostream& os);\n" + "ostream& operator<<( ostream&, ostream& (*f)( ostream& ) );\n" + "template< class CharT, class Traits > basic_ostream& endl( " + "basic_ostream& os );\n" + "template< class CharT, class Traits > basic_ostream& " + "operator<<( basic_ostream& os, const char* c );\n" + "#if __cplusplus >= 201103L\n" + "template< class CharT, class Traits, class T > basic_ostream& operator<<( basic_ostream&& os, const T& value );\n" + "#endif // __cplusplus >= 201103L\n" + "} // namespace __jitify_ostream_ns\n" + "namespace std { using namespace __jitify_ostream_ns; }\n" + "using namespace __jitify_ostream_ns;\n"; + +static const char* jitsafe_header_istream = + "#pragma once\n" + "\n" + "namespace __jitify_istream_ns {\n" + "template\n" // = std::char_traits + // >\n" + "struct basic_istream {\n" + "};\n" + "typedef basic_istream istream;\n" + "} // namespace __jitify_istream_ns\n" + "namespace std { using namespace __jitify_istream_ns; }\n" + "using namespace __jitify_istream_ns;\n"; + +static const char* jitsafe_header_sstream = + "#pragma once\n" + "#include \n" + "#include \n"; + +static const char* jitsafe_header_utility = + "#pragma once\n" + "namespace __jitify_utility_ns {\n" + "template\n" + "struct pair {\n" + " T1 first;\n" + " T2 second;\n" + " inline pair() {}\n" + " inline pair(T1 const& first_, T2 const& second_)\n" + " : first(first_), second(second_) {}\n" + " // TODO: Standard includes many more constructors...\n" + " // TODO: Comparison operators\n" + "};\n" + "template\n" + "pair make_pair(T1 const& first, T2 
const& second) {\n" + " return pair(first, second);\n" + "}\n" + "} // namespace __jitify_utility_ns\n" + "namespace std { using namespace __jitify_utility_ns; }\n" + "using namespace __jitify_utility_ns;\n"; + +// TODO: incomplete +static const char* jitsafe_header_vector = + "#pragma once\n" + "namespace __jitify_vector_ns {\n" + "template\n" // = std::allocator> \n" + "struct vector {\n" + "};\n" + "} // namespace __jitify_vector_ns\n" + "namespace std { using namespace __jitify_vector_ns; }\n" + "using namespace __jitify_vector_ns;\n"; + +// TODO: incomplete +static const char* jitsafe_header_string = + "#pragma once\n" + "namespace __jitify_string_ns {\n" + "template\n" + "struct basic_string {\n" + "basic_string();\n" + "basic_string( const CharT* s );\n" //, const Allocator& alloc = + // Allocator() );\n" + "const CharT* c_str() const;\n" + "bool empty() const;\n" + "void operator+=(const char *);\n" + "void operator+=(const basic_string &);\n" + "};\n" + "typedef basic_string string;\n" + "} // namespace __jitify_string_ns\n" + "namespace std { using namespace __jitify_string_ns; }\n" + "using namespace __jitify_string_ns;\n"; + +// TODO: incomplete +static const char* jitsafe_header_stdexcept = + "#pragma once\n" + "namespace __jitify_stdexcept_ns {\n" + "struct runtime_error {\n" + "explicit runtime_error( const std::string& what_arg );" + "explicit runtime_error( const char* what_arg );" + "virtual const char* what() const;\n" + "};\n" + "} // namespace __jitify_stdexcept_ns\n" + "namespace std { using namespace __jitify_stdexcept_ns; }\n" + "using namespace __jitify_stdexcept_ns;\n"; + +// TODO: incomplete +static const char* jitsafe_header_complex = + "#pragma once\n" + "namespace __jitify_complex_ns {\n" + "template\n" + "class complex {\n" + " T _real;\n" + " T _imag;\n" + "public:\n" + " complex() : _real(0), _imag(0) {}\n" + " complex(T const& real, T const& imag)\n" + " : _real(real), _imag(imag) {}\n" + " complex(T const& real)\n" + " : _real(real), _imag(static_cast(0)) {}\n" + " T const& real() const { return _real; }\n" + " T& real() { return _real; }\n" + " void real(const T &r) { _real = r; }\n" + " T const& imag() const { return _imag; }\n" + " T& imag() { return _imag; }\n" + " void imag(const T &i) { _imag = i; }\n" + " complex& operator+=(const complex z)\n" + " { _real += z.real(); _imag += z.imag(); return *this; }\n" + "};\n" + "template\n" + "complex operator*(const complex& lhs, const complex& rhs)\n" + " { return complex(lhs.real()*rhs.real()-lhs.imag()*rhs.imag(),\n" + " lhs.real()*rhs.imag()+lhs.imag()*rhs.real()); }\n" + "template\n" + "complex operator*(const complex& lhs, const T & rhs)\n" + " { return complexs(lhs.real()*rhs,lhs.imag()*rhs); }\n" + "template\n" + "complex operator*(const T& lhs, const complex& rhs)\n" + " { return complexs(rhs.real()*lhs,rhs.imag()*lhs); }\n" + "} // namespace __jitify_complex_ns\n" + "namespace std { using namespace __jitify_complex_ns; }\n" + "using namespace __jitify_complex_ns;\n"; + +// TODO: This is incomplete (missing binary and integer funcs, macros, +// constants, types) +static const char* jitsafe_header_math = + "#pragma once\n" + "namespace __jitify_math_ns {\n" + "#if __cplusplus >= 201103L\n" + "#define DEFINE_MATH_UNARY_FUNC_WRAPPER(f) \\\n" + " inline double f(double x) { return ::f(x); } \\\n" + " inline float f##f(float x) { return ::f(x); } \\\n" + " /*inline long double f##l(long double x) { return ::f(x); }*/ \\\n" + " inline float f(float x) { return ::f(x); } \\\n" + " /*inline long double 
f(long double x) { return ::f(x); }*/\n" + "#else\n" + "#define DEFINE_MATH_UNARY_FUNC_WRAPPER(f) \\\n" + " inline double f(double x) { return ::f(x); } \\\n" + " inline float f##f(float x) { return ::f(x); } \\\n" + " /*inline long double f##l(long double x) { return ::f(x); }*/\n" + "#endif\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(cos)\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(sin)\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(tan)\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(acos)\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(asin)\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(atan)\n" + "template inline T atan2(T y, T x) { return ::atan2(y, x); }\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(cosh)\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(sinh)\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(tanh)\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(exp)\n" + "template inline T frexp(T x, int* exp) { return ::frexp(x, " + "exp); }\n" + "template inline T ldexp(T x, int exp) { return ::ldexp(x, " + "exp); }\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(log)\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(log10)\n" + "template inline T modf(T x, T* intpart) { return ::modf(x, " + "intpart); }\n" + "template inline T pow(T x, T y) { return ::pow(x, y); }\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(sqrt)\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(ceil)\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(floor)\n" + "template inline T fmod(T n, T d) { return ::fmod(n, d); }\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(fabs)\n" + "template inline T abs(T x) { return ::abs(x); }\n" + "#if __cplusplus >= 201103L\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(acosh)\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(asinh)\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(atanh)\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(exp2)\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(expm1)\n" + "template inline int ilogb(T x) { return ::ilogb(x); }\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(log1p)\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(log2)\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(logb)\n" + "template inline T scalbn (T x, int n) { return ::scalbn(x, " + "n); }\n" + "template inline T scalbln(T x, long n) { return ::scalbn(x, " + "n); }\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(cbrt)\n" + "template inline T hypot(T x, T y) { return ::hypot(x, y); }\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(erf)\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(erfc)\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(tgamma)\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(lgamma)\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(trunc)\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(round)\n" + "template inline long lround(T x) { return ::lround(x); }\n" + "template inline long long llround(T x) { return ::llround(x); " + "}\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(rint)\n" + "template inline long lrint(T x) { return ::lrint(x); }\n" + "template inline long long llrint(T x) { return ::llrint(x); " + "}\n" + "DEFINE_MATH_UNARY_FUNC_WRAPPER(nearbyint)\n" + // TODO: remainder, remquo, copysign, nan, nextafter, nexttoward, fdim, + // fmax, fmin, fma + "#endif\n" + "#undef DEFINE_MATH_UNARY_FUNC_WRAPPER\n" + "} // namespace __jitify_math_ns\n" + "namespace std { using namespace __jitify_math_ns; }\n" + "#define M_PI 3.14159265358979323846\n" + // Note: Global namespace already includes CUDA math funcs + "//using namespace __jitify_math_ns;\n"; + +static const char* jitsafe_header_memory_h = R"( + #pragma once + #include + )"; + +// TODO: incomplete +static const char* jitsafe_header_mutex = R"( + #pragma once + #if __cplusplus >= 201103L + namespace __jitify_mutex_ns { + class mutex { + public: + void lock(); + bool try_lock(); + void unlock(); + }; + } // namespace __jitify_mutex_ns + namespace std { 
using namespace __jitify_mutex_ns; } + using namespace __jitify_mutex_ns; + #endif + )"; + +static const char* jitsafe_header_algorithm = R"( + #pragma once + #if __cplusplus >= 201103L + namespace __jitify_algorithm_ns { + + #if __cplusplus == 201103L + #define JITIFY_CXX14_CONSTEXPR + #else + #define JITIFY_CXX14_CONSTEXPR constexpr + #endif + + template JITIFY_CXX14_CONSTEXPR const T& max(const T& a, const T& b) + { + return (b > a) ? b : a; + } + template JITIFY_CXX14_CONSTEXPR const T& min(const T& a, const T& b) + { + return (b < a) ? b : a; + } + + } // namespace __jitify_algorithm_ns + namespace std { using namespace __jitify_algorithm_ns; } + using namespace __jitify_algorithm_ns; + #endif + )"; + +static const char* jitsafe_header_time_h = R"( + #pragma once + #define NULL 0 + #define CLOCKS_PER_SEC 1000000 + namespace __jitify_time_ns { + typedef long time_t; + struct tm { + int tm_sec; + int tm_min; + int tm_hour; + int tm_mday; + int tm_mon; + int tm_year; + int tm_wday; + int tm_yday; + int tm_isdst; + }; + #if __cplusplus >= 201703L + struct timespec { + time_t tv_sec; + long tv_nsec; + }; + #endif + } // namespace __jitify_time_ns + namespace std { + // NVRTC provides built-in definitions of ::size_t and ::clock_t. + using ::size_t; + using ::clock_t; + using namespace __jitify_time_ns; + } + using namespace __jitify_time_ns; + )"; + +// WAR: These need to be pre-included as a workaround for NVRTC implicitly using +// /usr/include as an include path. The other built-in headers will be included +// lazily as needed. +static const char* preinclude_jitsafe_header_names[] = { + "jitify_preinclude.h", + "limits.h", + "math.h", + "memory.h", + "stdint.h", + "stdlib.h", + "stdio.h", + "string.h", + "time.h", +}; + +template +int array_size(T (&)[N]) { + return N; +} +const int preinclude_jitsafe_headers_count = + array_size(preinclude_jitsafe_header_names); + +static const std::map& get_jitsafe_headers_map() { + static const std::map jitsafe_headers_map = { + {"jitify_preinclude.h", jitsafe_header_preinclude_h}, + {"float.h", jitsafe_header_float_h}, + {"cfloat", jitsafe_header_float_h}, + {"limits.h", jitsafe_header_limits_h}, + {"climits", jitsafe_header_limits_h}, + {"stdint.h", jitsafe_header_stdint_h}, + {"cstdint", jitsafe_header_stdint_h}, + {"stddef.h", jitsafe_header_stddef_h}, + {"cstddef", jitsafe_header_stddef_h}, + {"stdlib.h", jitsafe_header_stdlib_h}, + {"cstdlib", jitsafe_header_stdlib_h}, + {"stdio.h", jitsafe_header_stdio_h}, + {"cstdio", jitsafe_header_stdio_h}, + {"string.h", jitsafe_header_string_h}, + {"cstring", jitsafe_header_cstring}, + {"iterator", jitsafe_header_iterator}, + {"limits", jitsafe_header_limits}, + {"type_traits", jitsafe_header_type_traits}, + {"utility", jitsafe_header_utility}, + {"math.h", jitsafe_header_math}, + {"cmath", jitsafe_header_math}, + {"memory.h", jitsafe_header_memory_h}, + {"complex", jitsafe_header_complex}, + {"iostream", jitsafe_header_iostream}, + {"ostream", jitsafe_header_ostream}, + {"istream", jitsafe_header_istream}, + {"sstream", jitsafe_header_sstream}, + {"vector", jitsafe_header_vector}, + {"string", jitsafe_header_string}, + {"stdexcept", jitsafe_header_stdexcept}, + {"mutex", jitsafe_header_mutex}, + {"algorithm", jitsafe_header_algorithm}, + {"time.h", jitsafe_header_time_h}, + {"ctime", jitsafe_header_time_h}, + }; + return jitsafe_headers_map; +} + +inline void add_options_from_env(std::vector& options) { + // Add options from environment variable + const char* env_options = std::getenv("JITIFY_OPTIONS"); 
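+  // (Illustrative) e.g. JITIFY_OPTIONS="-I/opt/include -DNDEBUG" appends the
+  // two whitespace-separated tokens to `options` below.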
+ if (env_options) { + std::stringstream ss; + ss << env_options; + std::string opt; + while (!(ss >> opt).fail()) { + options.push_back(opt); + } + } + // Add options from JITIFY_OPTIONS macro +#ifdef JITIFY_OPTIONS +#define JITIFY_TOSTRING_IMPL(x) #x +#define JITIFY_TOSTRING(x) JITIFY_TOSTRING_IMPL(x) + std::stringstream ss; + ss << JITIFY_TOSTRING(JITIFY_OPTIONS); + std::string opt; + while (!(ss >> opt).fail()) { + options.push_back(opt); + } +#undef JITIFY_TOSTRING +#undef JITIFY_TOSTRING_IMPL +#endif // JITIFY_OPTIONS +} + +inline void detect_and_add_cuda_arch(std::vector& options) { + for (int i = 0; i < (int)options.size(); ++i) { + // Note that this will also match the middle of "--gpu-architecture". + if (options[i].find("-arch") != std::string::npos) { + // Arch already specified in options + return; + } + } + // Use the compute capability of the current device + // TODO: Check these API calls for errors + cudaError_t status; + int device; + status = cudaGetDevice(&device); + if (status != cudaSuccess) { + throw std::runtime_error( + std::string( + "Failed to detect GPU architecture: cudaGetDevice failed: ") + + cudaGetErrorString(status)); + } + int cc_major; + cudaDeviceGetAttribute(&cc_major, cudaDevAttrComputeCapabilityMajor, device); + int cc_minor; + cudaDeviceGetAttribute(&cc_minor, cudaDevAttrComputeCapabilityMinor, device); + int cc = cc_major * 10 + cc_minor; + // Note: We must limit the architecture to the max supported by the current + // version of NVRTC, otherwise newer hardware will cause errors + // on older versions of CUDA. + // TODO: It would be better to detect this somehow, rather than hard-coding it + + // Tegra chips do not have forwards compatibility so we need to special case + // them. + bool is_tegra = ((cc_major == 3 && cc_minor == 2) || // Logan + (cc_major == 5 && cc_minor == 3) || // Erista + (cc_major == 6 && cc_minor == 2) || // Parker + (cc_major == 7 && cc_minor == 2)); // Xavier + if (!is_tegra) { + // ensure that future CUDA versions just work (even if suboptimal) + const int cuda_major = std::min(10, CUDA_VERSION / 1000); + // clang-format off + switch (cuda_major) { + case 10: cc = std::min(cc, 75); break; // Turing + case 9: cc = std::min(cc, 70); break; // Volta + case 8: cc = std::min(cc, 61); break; // Pascal + case 7: cc = std::min(cc, 52); break; // Maxwell + default: + throw std::runtime_error("Unexpected CUDA major version " + + std::to_string(cuda_major)); + } + // clang-format on + } + + std::stringstream ss; + ss << cc; + options.push_back("-arch=compute_" + ss.str()); +} + +inline void detect_and_add_cxx11_flag(std::vector& options) { + // Reverse loop so we can erase on the fly. + for (int i = (int)options.size() - 1; i >= 0; --i) { + if (options[i].find("-std=c++98") != std::string::npos) { + // NVRTC doesn't support specifying c++98 explicitly, so we remove it. + options.erase(options.begin() + i); + return; + } else if (options[i].find("-std") != std::string::npos) { + // Some other standard was explicitly specified, don't change anything. + return; + } + } + // Jitify must be compiled with C++11 support, so we default to enabling it + // for the JIT-compiled code too. 
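+  // (Illustrative) e.g. {"-O3"}        -> {"-O3", "-std=c++11"}
+  //                     {"-std=c++14"} -> unchanged
+  //                     {"-std=c++98"} -> the flag is removed in the loop
+  //                                       above and nothing is added (NVRTC
+  //                                       has no c++98 mode).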
+ options.push_back("-std=c++11"); +} + +inline void split_compiler_and_linker_options( + std::vector options, + std::vector* compiler_options, + std::vector* linker_files, + std::vector* linker_paths) { + for (int i = 0; i < (int)options.size(); ++i) { + std::string opt = options[i]; + std::string flag = opt.substr(0, 2); + std::string value = opt.substr(2); + if (flag == "-l") { + linker_files->push_back(value); + } else if (flag == "-L") { + linker_paths->push_back(value); + } else { + compiler_options->push_back(opt); + } + } +} + +inline bool pop_remove_unused_globals_flag(std::vector* options) { + auto it = std::remove_if( + options->begin(), options->end(), [](const std::string& opt) { + return opt.find("-remove-unused-globals") != std::string::npos; + }); + if (it != options->end()) { + options->resize(it - options->begin()); + return true; + } + return false; +} + +inline std::string ptx_parse_decl_name(const std::string& line) { + size_t name_end = line.find_first_of("[;"); + if (name_end == std::string::npos) { + throw std::runtime_error( + "Failed to parse .global/.const declaration in PTX: expected a " + "semicolon"); + } + size_t name_start_minus1 = line.find_last_of(" \t", name_end); + if (name_start_minus1 == std::string::npos) { + throw std::runtime_error( + "Failed to parse .global/.const declaration in PTX: expected " + "whitespace"); + } + size_t name_start = name_start_minus1 + 1; + std::string name = line.substr(name_start, name_end - name_start); + return name; +} + +inline void ptx_remove_unused_globals(std::string* ptx) { + std::istringstream iss(*ptx); + std::vector lines; + std::unordered_map line_num_to_global_name; + std::unordered_set name_set; + for (std::string line; std::getline(iss, line);) { + size_t line_num = lines.size(); + lines.push_back(line); + auto terms = split_string(line); + if (terms.size() <= 1) continue; // Ignore lines with no arguments + if (terms[0].substr(0, 2) == "//") continue; // Ignore comment lines + if (terms[0].substr(0, 7) == ".global" || + terms[0].substr(0, 6) == ".const") { + line_num_to_global_name.emplace(line_num, ptx_parse_decl_name(line)); + continue; + } + if (terms[0][0] == '.') continue; // Ignore .version, .reg, .param etc. + // Note: The first term will always be an instruction name; starting at 1 + // also allows unchecked inspection of the previous term. + for (int i = 1; i < (int)terms.size(); ++i) { + if (terms[i].substr(0, 2) == "//") break; // Ignore comments + // Note: The characters '.' and '%' are not treated as delimiters. + const char* token_delims = " \t()[]{},;+-*/~&|^?:=!<>\"'\\"; + for (auto token : split_string(terms[i], -1, token_delims)) { + if ( // Ignore non-names + !(std::isalpha(token[0]) || token[0] == '_' || token[0] == '$') || + token.find('.') != std::string::npos || + // Ignore variable/parameter declarations + terms[i - 1][0] == '.' || + // Ignore branch instructions + (token == "bra" && terms[i - 1][0] == '@') || + // Ignore branch labels + (token.substr(0, 2) == "BB" && + terms[i - 1].substr(0, 3) == "bra")) { + continue; + } + name_set.insert(token); + } + } + } + std::ostringstream oss; + for (size_t line_num = 0; line_num < lines.size(); ++line_num) { + auto it = line_num_to_global_name.find(line_num); + if (it != line_num_to_global_name.end()) { + const std::string& name = it->second; + if (!name_set.count(name)) { + continue; // Remove unused .global declaration. 
+ } + } + oss << lines[line_num] << '\n'; + } + *ptx = oss.str(); +} + +inline nvrtcResult compile_kernel(std::string program_name, + std::map sources, + std::vector options, + std::string instantiation = "", + std::string* log = 0, std::string* ptx = 0, + std::string* mangled_instantiation = 0) { + std::string program_source = sources[program_name]; + // Build arrays of header names and sources + std::vector header_names_c; + std::vector header_sources_c; + int num_headers = (int)(sources.size() - 1); + header_names_c.reserve(num_headers); + header_sources_c.reserve(num_headers); + typedef std::map source_map; + for (source_map::const_iterator iter = sources.begin(); iter != sources.end(); + ++iter) { + std::string const& name = iter->first; + std::string const& code = iter->second; + if (name == program_name) { + continue; + } + header_names_c.push_back(name.c_str()); + header_sources_c.push_back(code.c_str()); + } + + // TODO: This WAR is expected to be unnecessary as of CUDA > 10.2. + bool should_remove_unused_globals = + detail::pop_remove_unused_globals_flag(&options); + + std::vector options_c(options.size() + 2); + options_c[0] = "--device-as-default-execution-space"; + options_c[1] = "--pre-include=jitify_preinclude.h"; + for (int i = 0; i < (int)options.size(); ++i) { + options_c[i + 2] = options[i].c_str(); + } + +#if CUDA_VERSION < 8000 + std::string inst_dummy; + if (!instantiation.empty()) { + // WAR for no nvrtcAddNameExpression before CUDA 8.0 + // Force template instantiation by adding dummy reference to kernel + inst_dummy = "__jitify_instantiation"; + program_source += + "\nvoid* " + inst_dummy + " = (void*)" + instantiation + ";\n"; + } +#endif + +#define CHECK_NVRTC(call) \ + do { \ + nvrtcResult ret = call; \ + if (ret != NVRTC_SUCCESS) { \ + return ret; \ + } \ + } while (0) + + nvrtcProgram nvrtc_program; + CHECK_NVRTC(nvrtcCreateProgram( + &nvrtc_program, program_source.c_str(), program_name.c_str(), num_headers, + header_sources_c.data(), header_names_c.data())); + +#if CUDA_VERSION >= 8000 + if (!instantiation.empty()) { + CHECK_NVRTC(nvrtcAddNameExpression(nvrtc_program, instantiation.c_str())); + } +#endif + + nvrtcResult ret = nvrtcCompileProgram(nvrtc_program, (int)options_c.size(), + options_c.data()); + if (log) { + size_t logsize; + CHECK_NVRTC(nvrtcGetProgramLogSize(nvrtc_program, &logsize)); + std::vector vlog(logsize, 0); + CHECK_NVRTC(nvrtcGetProgramLog(nvrtc_program, vlog.data())); + log->assign(vlog.data(), logsize); + } + if (ret != NVRTC_SUCCESS) { + return ret; + } + + if (ptx) { + size_t ptxsize; + CHECK_NVRTC(nvrtcGetPTXSize(nvrtc_program, &ptxsize)); + std::vector vptx(ptxsize); + CHECK_NVRTC(nvrtcGetPTX(nvrtc_program, vptx.data())); + ptx->assign(vptx.data(), ptxsize); + if (should_remove_unused_globals) { + detail::ptx_remove_unused_globals(ptx); + } + } + + if (!instantiation.empty() && mangled_instantiation) { +#if CUDA_VERSION >= 8000 + const char* mangled_instantiation_cstr; + // Note: The returned string pointer becomes invalid after + // nvrtcDestroyProgram has been called, so we save it. 
+ CHECK_NVRTC(nvrtcGetLoweredName(nvrtc_program, instantiation.c_str(), + &mangled_instantiation_cstr)); + *mangled_instantiation = mangled_instantiation_cstr; +#else + // Extract mangled kernel template instantiation from PTX + inst_dummy += " = "; // Note: This must match how the PTX is generated + int mi_beg = ptx->find(inst_dummy) + inst_dummy.size(); + int mi_end = ptx->find(";", mi_beg); + *mangled_instantiation = ptx->substr(mi_beg, mi_end - mi_beg); +#endif + } + + CHECK_NVRTC(nvrtcDestroyProgram(&nvrtc_program)); +#undef CHECK_NVRTC + return NVRTC_SUCCESS; +} + +inline void load_program(std::string const& cuda_source, + std::vector const& headers, + file_callback_type file_callback, + std::vector* include_paths, + std::map* program_sources, + std::vector* program_options, + std::string* program_name) { + // Extract include paths from compile options + std::vector::iterator iter = program_options->begin(); + while (iter != program_options->end()) { + std::string const& opt = *iter; + if (opt.substr(0, 2) == "-I") { + include_paths->push_back(opt.substr(2)); + iter = program_options->erase(iter); + } else { + ++iter; + } + } + + // Load program source + if (!detail::load_source(cuda_source, *program_sources, "", *include_paths, + file_callback)) { + throw std::runtime_error("Source not found: " + cuda_source); + } + *program_name = program_sources->begin()->first; + + // Maps header include names to their full file paths. + std::map header_fullpaths; + + // Load header sources + for (std::string const& header : headers) { + if (!detail::load_source(header, *program_sources, "", *include_paths, + file_callback, &header_fullpaths)) { + // **TODO: Deal with source not found + throw std::runtime_error("Source not found: " + header); + } + } + +#if JITIFY_PRINT_SOURCE + std::string& program_source = (*program_sources)[*program_name]; + std::cout << "---------------------------------------" << std::endl; + std::cout << "--- Source of " << *program_name << " ---" << std::endl; + std::cout << "---------------------------------------" << std::endl; + detail::print_with_line_numbers(program_source); + std::cout << "---------------------------------------" << std::endl; +#endif + + std::vector compiler_options, linker_files, linker_paths; + detail::split_compiler_and_linker_options(*program_options, &compiler_options, + &linker_files, &linker_paths); + + // If no arch is specified at this point we use whatever the current + // context is. This ensures we pick up the correct internal headers + // for arch-dependent compilation, e.g., some intrinsics are only + // present for specific architectures. + detail::detect_and_add_cuda_arch(compiler_options); + detail::detect_and_add_cxx11_flag(compiler_options); + + // Iteratively try to compile the sources, and use the resulting errors to + // identify missing headers. + std::string log; + nvrtcResult ret; + while ((ret = detail::compile_kernel(*program_name, *program_sources, + compiler_options, "", &log)) == + NVRTC_ERROR_COMPILATION) { + std::string include_name; + std::string include_parent; + int line_num = 0; + if (!detail::extract_include_info_from_compile_error( + log, include_name, include_parent, line_num)) { +#if JITIFY_PRINT_LOG + detail::print_compile_log(*program_name, log); +#endif + // There was a non include-related compilation error + // TODO: How to handle error? 
+ throw std::runtime_error("Runtime compilation failed"); + } + + bool is_included_with_quotes = false; + if (program_sources->count(include_parent)) { + const std::string& parent_source = (*program_sources)[include_parent]; + is_included_with_quotes = + is_include_directive_with_quotes(parent_source, line_num); + } + + // Try to load the new header + // Note: This fullpath lookup is needed because the compiler error + // messages have the include name of the header instead of its full path. + std::string include_parent_fullpath = header_fullpaths[include_parent]; + std::string include_path = detail::path_base(include_parent_fullpath); + if (detail::load_source(include_name, *program_sources, include_path, + *include_paths, file_callback, &header_fullpaths, + is_included_with_quotes)) { +#if JITIFY_PRINT_HEADER_PATHS + std::cout << "Found #include " << include_name << " from " + << include_parent << ":" << line_num << " [" + << include_parent_fullpath << "]" + << " at:\n " << header_fullpaths[include_name] << std::endl; +#endif + } else { // Failed to find header file. + // Comment-out the include line and print a warning + if (!program_sources->count(include_parent)) { + // ***TODO: Unless there's another mechanism (e.g., potentially + // the parent path vs. filename problem), getting + // here means include_parent was found automatically + // in a system include path. + // We need a WAR to zap it from *its parent*. + + typedef std::map source_map; + for (source_map::const_iterator it = program_sources->begin(); + it != program_sources->end(); ++it) { + std::cout << " " << it->first << std::endl; + } + throw std::out_of_range(include_parent + + " not in loaded sources!" + " This may be due to a header being loaded by" + " NVRTC without Jitify's knowledge."); + } + std::string& parent_source = (*program_sources)[include_parent]; + parent_source = detail::comment_out_code_line(line_num, parent_source); +#if JITIFY_PRINT_LOG + std::cout << include_parent << "(" << line_num + << "): warning: " << include_name << ": [jitify] File not found" + << std::endl; +#endif + } + } + if (ret != NVRTC_SUCCESS) { +#if JITIFY_PRINT_LOG + if (ret == NVRTC_ERROR_INVALID_OPTION) { + std::cout << "Compiler options: "; + for (int i = 0; i < (int)compiler_options.size(); ++i) { + std::cout << compiler_options[i] << " "; + } + std::cout << std::endl; + } +#endif + throw std::runtime_error(std::string("NVRTC error: ") + + nvrtcGetErrorString(ret)); + } +} + +inline void instantiate_kernel( + std::string const& program_name, + std::map const& program_sources, + std::string const& instantiation, std::vector const& options, + std::string* log, std::string* ptx, std::string* mangled_instantiation, + std::vector* linker_files, + std::vector* linker_paths) { + std::vector compiler_options; + detail::split_compiler_and_linker_options(options, &compiler_options, + linker_files, linker_paths); + + nvrtcResult ret = + detail::compile_kernel(program_name, program_sources, compiler_options, + instantiation, log, ptx, mangled_instantiation); +#if JITIFY_PRINT_LOG + if (log->size() > 1) { + detail::print_compile_log(program_name, *log); + } +#endif + if (ret != NVRTC_SUCCESS) { + throw std::runtime_error(std::string("NVRTC error: ") + + nvrtcGetErrorString(ret)); + } + +#if JITIFY_PRINT_PTX + std::cout << "---------------------------------------" << std::endl; + std::cout << *mangled_instantiation << std::endl; + std::cout << "---------------------------------------" << std::endl; + std::cout << "--- PTX for " << 
mangled_instantiation << " in " << program_name + << " ---" << std::endl; + std::cout << "---------------------------------------" << std::endl; + std::cout << *ptx << std::endl; + std::cout << "---------------------------------------" << std::endl; +#endif +} + +inline void get_1d_max_occupancy(CUfunction func, + CUoccupancyB2DSize smem_callback, + unsigned int* smem, int max_block_size, + unsigned int flags, int* grid, int* block) { + if (!func) { + throw std::runtime_error( + "Kernel pointer is NULL; you may need to define JITIFY_THREAD_SAFE " + "1"); + } + CUresult res = cuOccupancyMaxPotentialBlockSizeWithFlags( + grid, block, func, smem_callback, *smem, max_block_size, flags); + if (res != CUDA_SUCCESS) { + const char* msg; + cuGetErrorName(res, &msg); + throw std::runtime_error(msg); + } + if (smem_callback) { + *smem = (unsigned int)smem_callback(*block); + } +} + +} // namespace detail + +//! \endcond + +class KernelInstantiation; +class Kernel; +class Program; +class JitCache; + +struct ProgramConfig { + std::vector options; + std::vector include_paths; + std::string name; + typedef std::map source_map; + source_map sources; +}; + +class JitCache_impl { + friend class Program_impl; + friend class KernelInstantiation_impl; + friend class KernelLauncher_impl; + typedef uint64_t key_type; + jitify::ObjectCache _kernel_cache; + jitify::ObjectCache _program_config_cache; + std::vector _options; +#if JITIFY_THREAD_SAFE + std::mutex _kernel_cache_mutex; + std::mutex _program_cache_mutex; +#endif + public: + inline JitCache_impl(size_t cache_size) + : _kernel_cache(cache_size), _program_config_cache(cache_size) { + detail::add_options_from_env(_options); + + // Bootstrap the cuda context to avoid errors + cudaFree(0); + } +}; + +class Program_impl { + // A friendly class + friend class Kernel_impl; + friend class KernelLauncher_impl; + friend class KernelInstantiation_impl; + // TODO: This can become invalid if JitCache is destroyed before the + // Program object is. However, this can't happen if JitCache + // instances are static. 
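+ // In other words, the JitCache that created this Program must outlive it;
+ // a static (or thread_local static, as parallel_for later in this header
+ // uses) JitCache is the pattern this note assumes.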
+ JitCache_impl& _cache; + uint64_t _hash; + ProgramConfig* _config; + void load_sources(std::string source, std::vector headers, + std::vector options, + file_callback_type file_callback); + + public: + inline Program_impl(JitCache_impl& cache, std::string source, + jitify::detail::vector headers = 0, + jitify::detail::vector options = 0, + file_callback_type file_callback = 0); + inline Program_impl(Program_impl const&) = default; + inline Program_impl(Program_impl&&) = default; + inline std::vector const& options() const { + return _config->options; + } + inline std::string const& name() const { return _config->name; } + inline ProgramConfig::source_map const& sources() const { + return _config->sources; + } + inline std::vector const& include_paths() const { + return _config->include_paths; + } +}; + +class Kernel_impl { + friend class KernelLauncher_impl; + friend class KernelInstantiation_impl; + Program_impl _program; + std::string _name; + std::vector _options; + uint64_t _hash; + + public: + inline Kernel_impl(Program_impl const& program, std::string name, + jitify::detail::vector options = 0); + inline Kernel_impl(Kernel_impl const&) = default; + inline Kernel_impl(Kernel_impl&&) = default; +}; + +class KernelInstantiation_impl { + friend class KernelLauncher_impl; + Kernel_impl _kernel; + uint64_t _hash; + std::string _template_inst; + std::vector _options; + detail::CUDAKernel* _cuda_kernel; + inline void print() const; + void build_kernel(); + + public: + inline KernelInstantiation_impl( + Kernel_impl const& kernel, std::vector const& template_args); + inline KernelInstantiation_impl(KernelInstantiation_impl const&) = default; + inline KernelInstantiation_impl(KernelInstantiation_impl&&) = default; + detail::CUDAKernel const& cuda_kernel() const { return *_cuda_kernel; } +}; + +class KernelLauncher_impl { + KernelInstantiation_impl _kernel_inst; + dim3 _grid; + dim3 _block; + unsigned int _smem; + cudaStream_t _stream; + + public: + inline KernelLauncher_impl(KernelInstantiation_impl const& kernel_inst, + dim3 grid, dim3 block, unsigned int smem = 0, + cudaStream_t stream = 0) + : _kernel_inst(kernel_inst), + _grid(grid), + _block(block), + _smem(smem), + _stream(stream) {} + inline KernelLauncher_impl(KernelLauncher_impl const&) = default; + inline KernelLauncher_impl(KernelLauncher_impl&&) = default; + inline CUresult launch( + jitify::detail::vector arg_ptrs, + jitify::detail::vector arg_types = 0) const; +}; + +/*! An object representing a configured and instantiated kernel ready + * for launching. + */ +class KernelLauncher { + std::unique_ptr _impl; + + public: + inline KernelLauncher(KernelInstantiation const& kernel_inst, dim3 grid, + dim3 block, unsigned int smem = 0, + cudaStream_t stream = 0); + + // Note: It's important that there is no implicit conversion required + // for arg_ptrs, because otherwise the parameter pack version + // below gets called instead (probably resulting in a segfault). + /*! Launch the kernel. + * + * \param arg_ptrs A vector of pointers to each function argument for the + * kernel. + * \param arg_types A vector of function argument types represented + * as code-strings. This parameter is optional and is only used to print + * out the function signature. + */ + inline CUresult launch( + std::vector arg_ptrs = std::vector(), + jitify::detail::vector arg_types = 0) const { + return _impl->launch(arg_ptrs, arg_types); + } + // Regular function call syntax + /*! Launch the kernel. 
+ * + * \see launch + */ + template + inline CUresult operator()(ArgTypes... args) const { + return this->launch(args...); + } + /*! Launch the kernel. + * + * \param args Function arguments for the kernel. + */ + template + inline CUresult launch(ArgTypes... args) const { + return this->launch(std::vector({(void*)&args...}), + {reflection::reflect()...}); + } +}; + +/*! An object representing a kernel instantiation made up of a Kernel and + * template arguments. + */ +class KernelInstantiation { + friend class KernelLauncher; + std::unique_ptr _impl; + + public: + inline KernelInstantiation(Kernel const& kernel, + std::vector const& template_args); + + /*! Implicit conversion to the underlying CUfunction object. + * + * \note This allows use of CUDA APIs like + * cuOccupancyMaxActiveBlocksPerMultiprocessor. + */ + inline operator CUfunction() const { return _impl->cuda_kernel(); } + + /*! Configure the kernel launch. + * + * \see configure + */ + inline KernelLauncher operator()(dim3 grid, dim3 block, unsigned int smem = 0, + cudaStream_t stream = 0) const { + return this->configure(grid, block, smem, stream); + } + /*! Configure the kernel launch. + * + * \param grid The thread grid dimensions for the launch. + * \param block The thread block dimensions for the launch. + * \param smem The amount of shared memory to dynamically allocate, in + * bytes. + * \param stream The CUDA stream to launch the kernel in. + */ + inline KernelLauncher configure(dim3 grid, dim3 block, unsigned int smem = 0, + cudaStream_t stream = 0) const { + return KernelLauncher(*this, grid, block, smem, stream); + } + /*! Configure the kernel launch with a 1-dimensional block and grid chosen + * automatically to maximise occupancy. + * + * \param max_block_size The upper limit on the block size, or 0 for no + * limit. + * \param smem The amount of shared memory to dynamically allocate, in bytes. + * \param smem_callback A function returning smem for a given block size (overrides \p smem). + * \param stream The CUDA stream to launch the kernel in. + * \param flags The flags to pass to cuOccupancyMaxPotentialBlockSizeWithFlags. + */ + inline KernelLauncher configure_1d_max_occupancy( + int max_block_size = 0, unsigned int smem = 0, + CUoccupancyB2DSize smem_callback = 0, cudaStream_t stream = 0, + unsigned int flags = 0) const { + int grid; + int block; + CUfunction func = _impl->cuda_kernel(); + detail::get_1d_max_occupancy(func, smem_callback, &smem, max_block_size, + flags, &grid, &block); + return this->configure(grid, block, smem, stream); + } + + /* + * \deprecated Use \p get_global_ptr instead. + */ + inline CUdeviceptr get_constant_ptr(const char* name, + size_t* size = nullptr) const { + return get_global_ptr(name, size); + } + + /* + * Get a device pointer to a global __constant__ or __device__ variable using + * its un-mangled name. If provided, *size is set to the size of the variable + * in bytes. + */ + inline CUdeviceptr get_global_ptr(const char* name, + size_t* size = nullptr) const { + return _impl->cuda_kernel().get_global_ptr(name, size); + } + + /* + * Copy data from a global __constant__ or __device__ array to the host using + * its un-mangled name. + */ + template + inline CUresult get_global_array(const char* name, T* data, size_t count, + CUstream stream = 0) const { + return _impl->cuda_kernel().get_global_data(name, data, count, stream); + } + + /* + * Copy a value from a global __constant__ or __device__ variable to the host + * using its un-mangled name. 
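+ *
+ * A minimal sketch, assuming the JIT-compiled source declares a variable
+ * such as "__device__ float tolerance;" (the name is hypothetical) and
+ * kernel_inst is this KernelInstantiation:
+ *
+ *   float tol = 0.0f;
+ *   kernel_inst.get_global_value("tolerance", &tol);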
+ */ + template + inline CUresult get_global_value(const char* name, T* value, + CUstream stream = 0) const { + return get_global_array(name, value, 1, stream); + } + + /* + * Copy data from the host to a global __constant__ or __device__ array using + * its un-mangled name. + */ + template + inline CUresult set_global_array(const char* name, const T* data, + size_t count, CUstream stream = 0) const { + return _impl->cuda_kernel().set_global_data(name, data, count, stream); + } + + /* + * Copy a value from the host to a global __constant__ or __device__ variable + * using its un-mangled name. + */ + template + inline CUresult set_global_value(const char* name, const T& value, + CUstream stream = 0) const { + return set_global_array(name, &value, 1, stream); + } + + const std::string& mangled_name() const { + return _impl->cuda_kernel().function_name(); + } + + const std::string& ptx() const { return _impl->cuda_kernel().ptx(); } + + const std::vector& link_files() const { + return _impl->cuda_kernel().link_files(); + } + + const std::vector& link_paths() const { + return _impl->cuda_kernel().link_paths(); + } +}; + +/*! An object representing a kernel made up of a Program, a name and options. + */ +class Kernel { + friend class KernelInstantiation; + std::unique_ptr _impl; + + public: + Kernel(Program const& program, std::string name, + jitify::detail::vector options = 0); + + /*! Instantiate the kernel. + * + * \param template_args A vector of template arguments represented as + * code-strings. These can be generated using + * \code{.cpp}jitify::reflection::reflect()\endcode or + * \code{.cpp}jitify::reflection::reflect(value)\endcode + * + * \note Template type deduction is not possible, so all types must be + * explicitly specified. + */ + // inline KernelInstantiation instantiate(std::vector const& + // template_args) const { + inline KernelInstantiation instantiate( + std::vector const& template_args = + std::vector()) const { + return KernelInstantiation(*this, template_args); + } + + // Regular template instantiation syntax (note limited flexibility) + /*! Instantiate the kernel. + * + * \note The template arguments specified on this function are + * used to instantiate the kernel. Non-type template arguments must + * be wrapped with + * \code{.cpp}jitify::reflection::NonType\endcode + * + * \note Template type deduction is not possible, so all types must be + * explicitly specified. + */ + template + inline KernelInstantiation instantiate() const { + return this->instantiate( + std::vector({reflection::reflect()...})); + } + // Template-like instantiation syntax + // E.g., instantiate(myvar,Type())(grid,block) + /*! Instantiate the kernel. + * + * \param targs The template arguments for the kernel, represented as + * values. Types must be wrapped with + * \code{.cpp}jitify::reflection::Type()\endcode or + * \code{.cpp}jitify::reflection::type_of(value)\endcode + * + * \note Template type deduction is not possible, so all types must be + * explicitly specified. + */ + template + inline KernelInstantiation instantiate(TemplateArgs... targs) const { + return this->instantiate( + std::vector({reflection::reflect(targs)...})); + } +}; + +/*! An object representing a program made up of source code, headers + * and options. + */ +class Program { + friend class Kernel; + std::unique_ptr _impl; + + public: + Program(JitCache& cache, std::string source, + jitify::detail::vector headers = 0, + jitify::detail::vector options = 0, + file_callback_type file_callback = 0); + + /*! 
Select a kernel. + * + * \param name The name of the kernel (unmangled and without + * template arguments). + * \param options A vector of options to be passed to the NVRTC + * compiler when compiling this kernel. + */ + inline Kernel kernel(std::string name, + jitify::detail::vector options = 0) const { + return Kernel(*this, name, options); + } + /*! Select a kernel. + * + * \see kernel + */ + inline Kernel operator()( + std::string name, jitify::detail::vector options = 0) const { + return this->kernel(name, options); + } +}; + +/*! An object that manages a cache of JIT-compiled CUDA kernels. + * + */ +class JitCache { + friend class Program; + std::unique_ptr _impl; + + public: + /*! JitCache constructor. + * \param cache_size The number of kernels to hold in the cache + * before overwriting the least-recently-used ones. + */ + enum { DEFAULT_CACHE_SIZE = 128 }; + JitCache(size_t cache_size = DEFAULT_CACHE_SIZE) + : _impl(new JitCache_impl(cache_size)) {} + + /*! Create a program. + * + * \param source A string containing either the source filename or + * the source itself; in the latter case, the first line must be + * the name of the program. + * \param headers A vector of strings representing the source of + * each header file required by the program. Each entry can be + * either the header filename or the header source itself; in + * the latter case, the first line must be the name of the header + * (i.e., the name by which the header is #included). + * \param options A vector of options to be passed to the + * NVRTC compiler. Include paths specified with \p -I + * are added to the search paths used by Jitify. The environment + * variable JITIFY_OPTIONS can also be used to define additional + * options. + * \param file_callback A pointer to a callback function that is + * invoked whenever a source file needs to be loaded. Inside this + * function, the user can either load/specify the source themselves + * or defer to Jitify's file-loading mechanisms. + * \note Program or header source files referenced by filename are + * looked-up using the following mechanisms (in this order): + * \note 1) By calling file_callback. + * \note 2) By looking for the file embedded in the executable via the GCC + * linker. + * \note 3) By looking for the file in the filesystem. + * + * \note Jitify recursively scans all source files for \p #include + * directives and automatically adds them to the set of headers needed + * by the program. + * If a \p #include directive references a header that cannot be found, + * the directive is automatically removed from the source code to prevent + * immediate compilation failure. This may result in compilation errors + * if the header was required by the program. + * + * \note Jitify automatically includes NVRTC-safe versions of some + * standard library headers. 
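+ *
+ * \note A minimal sketch of creating and launching a program from an inline
+ * source string; the program name, kernel, and variable names are made up
+ * for illustration:
+ * \code{.cpp}
+ * static jitify::JitCache kernel_cache;
+ * const char* source =
+ *     "my_program\n"
+ *     "template<typename T>\n"
+ *     "__global__ void my_kernel(T* data) { data[threadIdx.x] += T(1); }\n";
+ * jitify::Program program = kernel_cache.program(source);
+ * float* d_data = nullptr;  // assume a valid device allocation here
+ * program.kernel("my_kernel")
+ *     .instantiate({"float"})
+ *     .configure(dim3(1), dim3(32))
+ *     .launch(d_data);
+ * \endcode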
+ */ + inline Program program(std::string source, + jitify::detail::vector headers = 0, + jitify::detail::vector options = 0, + file_callback_type file_callback = 0) { + return Program(*this, source, headers, options, file_callback); + } +}; + +inline Program::Program(JitCache& cache, std::string source, + jitify::detail::vector headers, + jitify::detail::vector options, + file_callback_type file_callback) + : _impl(new Program_impl(*cache._impl, source, headers, options, + file_callback)) {} + +inline Kernel::Kernel(Program const& program, std::string name, + jitify::detail::vector options) + : _impl(new Kernel_impl(*program._impl, name, options)) {} + +inline KernelInstantiation::KernelInstantiation( + Kernel const& kernel, std::vector const& template_args) + : _impl(new KernelInstantiation_impl(*kernel._impl, template_args)) {} + +inline KernelLauncher::KernelLauncher(KernelInstantiation const& kernel_inst, + dim3 grid, dim3 block, unsigned int smem, + cudaStream_t stream) + : _impl(new KernelLauncher_impl(*kernel_inst._impl, grid, block, smem, + stream)) {} + +inline std::ostream& operator<<(std::ostream& stream, dim3 d) { + if (d.y == 1 && d.z == 1) { + stream << d.x; + } else { + stream << "(" << d.x << "," << d.y << "," << d.z << ")"; + } + return stream; +} + +inline CUresult KernelLauncher_impl::launch( + jitify::detail::vector arg_ptrs, + jitify::detail::vector arg_types) const { +#if JITIFY_PRINT_LAUNCH + Kernel_impl const& kernel = _kernel_inst._kernel; + std::string arg_types_string = + (arg_types.empty() ? "..." : reflection::reflect_list(arg_types)); + std::cout << "Launching " << kernel._name << _kernel_inst._template_inst + << "<<<" << _grid << "," << _block << "," << _smem << "," << _stream + << ">>>" + << "(" << arg_types_string << ")" << std::endl; +#endif + if (!_kernel_inst._cuda_kernel) { + throw std::runtime_error( + "Kernel pointer is NULL; you may need to define JITIFY_THREAD_SAFE 1"); + } + return _kernel_inst._cuda_kernel->launch(_grid, _block, _smem, _stream, + arg_ptrs); +} + +inline KernelInstantiation_impl::KernelInstantiation_impl( + Kernel_impl const& kernel, std::vector const& template_args) + : _kernel(kernel), _options(kernel._options) { + _template_inst = + (template_args.empty() ? 
"" + : reflection::reflect_template(template_args)); + using detail::hash_combine; + using detail::hash_larson64; + _hash = _kernel._hash; + _hash = hash_combine(_hash, hash_larson64(_template_inst.c_str())); + JitCache_impl& cache = _kernel._program._cache; + uint64_t cache_key = _hash; +#if JITIFY_THREAD_SAFE + std::lock_guard lock(cache._kernel_cache_mutex); +#endif + if (cache._kernel_cache.contains(cache_key)) { +#if JITIFY_PRINT_INSTANTIATION + std::cout << "Found "; + this->print(); +#endif + _cuda_kernel = &cache._kernel_cache.get(cache_key); + } else { +#if JITIFY_PRINT_INSTANTIATION + std::cout << "Building "; + this->print(); +#endif + _cuda_kernel = &cache._kernel_cache.emplace(cache_key); + this->build_kernel(); + } +} + +inline void KernelInstantiation_impl::print() const { + std::string options_string = reflection::reflect_list(_options); + std::cout << _kernel._name << _template_inst << " [" << options_string << "]" + << std::endl; +} + +inline void KernelInstantiation_impl::build_kernel() { + Program_impl const& program = _kernel._program; + + std::string instantiation = _kernel._name + _template_inst; + + std::string log, ptx, mangled_instantiation; + std::vector linker_files, linker_paths; + detail::instantiate_kernel(program.name(), program.sources(), instantiation, + _options, &log, &ptx, &mangled_instantiation, + &linker_files, &linker_paths); + + _cuda_kernel->set(mangled_instantiation.c_str(), ptx.c_str(), linker_files, + linker_paths); +} + +Kernel_impl::Kernel_impl(Program_impl const& program, std::string name, + jitify::detail::vector options) + : _program(program), _name(name), _options(options) { + // Merge options from parent + _options.insert(_options.end(), _program.options().begin(), + _program.options().end()); + detail::detect_and_add_cuda_arch(_options); + detail::detect_and_add_cxx11_flag(_options); + std::string options_string = reflection::reflect_list(_options); + using detail::hash_combine; + using detail::hash_larson64; + _hash = _program._hash; + _hash = hash_combine(_hash, hash_larson64(_name.c_str())); + _hash = hash_combine(_hash, hash_larson64(options_string.c_str())); +} + +Program_impl::Program_impl(JitCache_impl& cache, std::string source, + jitify::detail::vector headers, + jitify::detail::vector options, + file_callback_type file_callback) + : _cache(cache) { + // Compute hash of source, headers and options + std::string options_string = reflection::reflect_list(options); + using detail::hash_combine; + using detail::hash_larson64; + _hash = hash_combine(hash_larson64(source.c_str()), + hash_larson64(options_string.c_str())); + for (size_t i = 0; i < headers.size(); ++i) { + _hash = hash_combine(_hash, hash_larson64(headers[i].c_str())); + } + _hash = hash_combine(_hash, (uint64_t)file_callback); + // Add pre-include built-in JIT-safe headers + for (int i = 0; i < detail::preinclude_jitsafe_headers_count; ++i) { + const char* hdr_name = detail::preinclude_jitsafe_header_names[i]; + const std::string& hdr_source = + detail::get_jitsafe_headers_map().at(hdr_name); + headers.push_back(std::string(hdr_name) + "\n" + hdr_source); + } + // Merge options from parent + options.insert(options.end(), _cache._options.begin(), _cache._options.end()); + // Load sources +#if JITIFY_THREAD_SAFE + std::lock_guard lock(cache._program_cache_mutex); +#endif + if (!cache._program_config_cache.contains(_hash)) { + _config = &cache._program_config_cache.insert(_hash); + this->load_sources(source, headers, options, file_callback); + } else { + _config = 
&cache._program_config_cache.get(_hash); + } +} + +inline void Program_impl::load_sources(std::string source, + std::vector headers, + std::vector options, + file_callback_type file_callback) { + _config->options = options; + detail::load_program(source, headers, file_callback, &_config->include_paths, + &_config->sources, &_config->options, &_config->name); +} + +enum Location { HOST, DEVICE }; + +/*! Specifies location and parameters for execution of an algorithm. + * \param stream The CUDA stream on which to execute. + * \param headers A vector of headers to include in the code. + * \param options Options to pass to the NVRTC compiler. + * \param file_callback See jitify::Program. + * \param block_size The size of the CUDA thread block with which to + * execute. + * \param cache_size The number of kernels to store in the cache + * before overwriting the least-recently-used ones. + */ +struct ExecutionPolicy { + /*! Location (HOST or DEVICE) on which to execute.*/ + Location location; + /*! List of headers to include when compiling the algorithm.*/ + std::vector headers; + /*! List of compiler options.*/ + std::vector options; + /*! Optional callback for loading source files.*/ + file_callback_type file_callback; + /*! CUDA stream on which to execute.*/ + cudaStream_t stream; + /*! CUDA device on which to execute.*/ + int device; + /*! CUDA block size with which to execute.*/ + int block_size; + /*! The number of instantiations to store in the cache before overwriting + * the least-recently-used ones.*/ + size_t cache_size; + ExecutionPolicy(Location location_ = DEVICE, + jitify::detail::vector headers_ = 0, + jitify::detail::vector options_ = 0, + file_callback_type file_callback_ = 0, + cudaStream_t stream_ = 0, int device_ = 0, + int block_size_ = 256, + size_t cache_size_ = JitCache::DEFAULT_CACHE_SIZE) + : location(location_), + headers(headers_), + options(options_), + file_callback(file_callback_), + stream(stream_), + device(device_), + block_size(block_size_), + cache_size(cache_size_) {} +}; + +template +class Lambda; + +/*! An object that captures a set of variables for use in a parallel_for + * expression. See JITIFY_CAPTURE(). + */ +class Capture { + public: + std::vector _arg_decls; + std::vector _arg_ptrs; + + public: + template + inline Capture(std::vector arg_names, Args const&... args) + : _arg_ptrs{(void*)&args...} { + std::vector arg_types = {reflection::reflect()...}; + _arg_decls.resize(arg_names.size()); + for (int i = 0; i < (int)arg_names.size(); ++i) { + _arg_decls[i] = arg_types[i] + " " + arg_names[i]; + } + } +}; + +/*! An object that captures the instantiated Lambda function for use + in a parallel_for expression and the function string for NVRTC + compilation + */ +template +class Lambda { + public: + Capture _capture; + std::string _func_string; + Func _func; + + public: + inline Lambda(Capture const& capture, std::string func_string, Func func) + : _capture(capture), _func_string(func_string), _func(func) {} +}; + +template +inline Lambda make_Lambda(Capture const& capture, std::string func, + T lambda) { + return Lambda(capture, func, lambda); +} + +#define JITIFY_CAPTURE(...) \ + jitify::Capture(jitify::detail::split_string(#__VA_ARGS__, -1, ","), \ + __VA_ARGS__) + +#define JITIFY_MAKE_LAMBDA(capture, x, ...) \ + jitify::make_Lambda(capture, std::string(#__VA_ARGS__), \ + [x](int i) { __VA_ARGS__; }) + +#define JITIFY_ARGS(...) __VA_ARGS__ + +#define JITIFY_LAMBDA_(x, ...) 
\ + JITIFY_MAKE_LAMBDA(JITIFY_CAPTURE(x), JITIFY_ARGS(x), __VA_ARGS__) + +// macro sequence to strip surrounding brackets +#define JITIFY_STRIP_PARENS(X) X +#define JITIFY_PASS_PARAMETERS(X) JITIFY_STRIP_PARENS(JITIFY_ARGS X) + +/*! Creates a Lambda object with captured variables and a function + * definition. + * \param capture A bracket-enclosed list of variables to capture. + * \param ... The function definition. + * + * \code{.cpp} + * float* capture_me; + * int capture_me_too; + * auto my_lambda = JITIFY_LAMBDA( (capture_me, capture_me_too), + * capture_me[i] = i*capture_me_too ); + * \endcode + */ +#define JITIFY_LAMBDA(capture, ...) \ + JITIFY_LAMBDA_(JITIFY_ARGS(JITIFY_PASS_PARAMETERS(capture)), \ + JITIFY_ARGS(__VA_ARGS__)) + +// TODO: Try to implement for_each that accepts iterators instead of indices +// Add compile guard for NOCUDA compilation +/*! Call a function for a range of indices + * + * \param policy Determines the location and device parameters for + * execution of the parallel_for. + * \param begin The starting index. + * \param end The ending index. + * \param lambda A Lambda object created using the JITIFY_LAMBDA() macro. + * + * \code{.cpp} + * char const* in; + * float* out; + * parallel_for(0, 100, JITIFY_LAMBDA( (in, out), {char x = in[i]; out[i] = + * x*x; } ); \endcode + */ +template +CUresult parallel_for(ExecutionPolicy policy, IndexType begin, IndexType end, + Lambda const& lambda) { + using namespace jitify; + + if (policy.location == HOST) { +#ifdef _OPENMP +#pragma omp parallel for +#endif + for (IndexType i = begin; i < end; i++) { + lambda._func(i); + } + return CUDA_SUCCESS; // FIXME - replace with non-CUDA enum type? + } + + thread_local static JitCache kernel_cache(policy.cache_size); + + std::vector arg_decls; + arg_decls.push_back("I begin, I end"); + arg_decls.insert(arg_decls.end(), lambda._capture._arg_decls.begin(), + lambda._capture._arg_decls.end()); + + std::stringstream source_ss; + source_ss << "parallel_for_program\n"; + for (auto const& header : policy.headers) { + std::string header_name = header.substr(0, header.find("\n")); + source_ss << "#include <" << header_name << ">\n"; + } + source_ss << "template\n" + "__global__\n" + "void parallel_for_kernel(" + << reflection::reflect_list(arg_decls) + << ") {\n" + " I i0 = threadIdx.x + blockDim.x*blockIdx.x;\n" + " for( I i=i0+begin; i arg_ptrs; + arg_ptrs.push_back(&begin); + arg_ptrs.push_back(&end); + arg_ptrs.insert(arg_ptrs.end(), lambda._capture._arg_ptrs.begin(), + lambda._capture._arg_ptrs.end()); + + size_t n = end - begin; + dim3 block(policy.block_size); + dim3 grid((unsigned int)std::min((n - 1) / block.x + 1, size_t(65535))); + cudaSetDevice(policy.device); + return program.kernel("parallel_for_kernel") + .instantiate() + .configure(grid, block, 0, policy.stream) + .launch(arg_ptrs); +} + +namespace experimental { + +using jitify::file_callback_type; + +namespace serialization { + +namespace detail { + +// This should be incremented whenever the serialization format changes in any +// incompatible way. 
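+//
+// For reference, the format produced by the serialize()/deserialize()
+// overloads below is, informally: the 4-byte magic "JTFY", the version as a
+// 64-bit count, then each value in order, where sizes/counts are written as
+// 64-bit integers, strings as a count followed by their raw bytes, and
+// vectors/maps as an element count followed by their entries. Values use the
+// host byte order; no endianness conversion is performed.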
+static constexpr const size_t kSerializationVersion = 1; + +inline void serialize(std::ostream& stream, size_t u) { + uint64_t u64 = u; + stream.write(reinterpret_cast(&u64), sizeof(u64)); +} + +inline bool deserialize(std::istream& stream, size_t* size) { + uint64_t u64; + stream.read(reinterpret_cast(&u64), sizeof(u64)); + *size = u64; + return stream.good(); +} + +inline void serialize(std::ostream& stream, std::string const& s) { + serialize(stream, s.size()); + stream.write(s.data(), s.size()); +} + +inline bool deserialize(std::istream& stream, std::string* s) { + size_t size; + if (!deserialize(stream, &size)) return false; + s->resize(size); + if (s->size()) { + stream.read(&(*s)[0], s->size()); + } + return stream.good(); +} + +inline void serialize(std::ostream& stream, std::vector const& v) { + serialize(stream, v.size()); + for (auto const& s : v) { + serialize(stream, s); + } +} + +inline bool deserialize(std::istream& stream, std::vector* v) { + size_t size; + if (!deserialize(stream, &size)) return false; + v->resize(size); + for (auto& s : *v) { + if (!deserialize(stream, &s)) return false; + } + return true; +} + +inline void serialize(std::ostream& stream, + std::map const& m) { + serialize(stream, m.size()); + for (auto const& kv : m) { + serialize(stream, kv.first); + serialize(stream, kv.second); + } +} + +inline bool deserialize(std::istream& stream, + std::map* m) { + size_t size; + if (!deserialize(stream, &size)) return false; + for (size_t i = 0; i < size; ++i) { + std::string key; + if (!deserialize(stream, &key)) return false; + if (!deserialize(stream, &(*m)[key])) return false; + } + return true; +} + +template +inline void serialize(std::ostream& stream, T const& value, Rest... rest) { + serialize(stream, value); + serialize(stream, rest...); +} + +template +inline bool deserialize(std::istream& stream, T* value, Rest... rest) { + if (!deserialize(stream, value)) return false; + return deserialize(stream, rest...); +} + +inline void serialize_magic_number(std::ostream& stream) { + stream.write("JTFY", 4); + serialize(stream, kSerializationVersion); +} + +inline bool deserialize_magic_number(std::istream& stream) { + char magic_number[4] = {0, 0, 0, 0}; + stream.read(&magic_number[0], 4); + if (!(magic_number[0] == 'J' && magic_number[1] == 'T' && + magic_number[2] == 'F' && magic_number[3] == 'Y')) { + return false; + } + size_t serialization_version; + if (!deserialize(stream, &serialization_version)) return false; + return serialization_version == kSerializationVersion; +} + +} // namespace detail + +template +inline std::string serialize(Values const&... values) { + std::ostringstream ss(std::stringstream::out | std::stringstream::binary); + detail::serialize_magic_number(ss); + detail::serialize(ss, values...); + return ss.str(); +} + +template +inline bool deserialize(std::string const& serialized, Values*... values) { + std::istringstream ss(serialized, + std::stringstream::in | std::stringstream::binary); + if (!detail::deserialize_magic_number(ss)) return false; + return detail::deserialize(ss, values...); +} + +} // namespace serialization + +class Program; +class Kernel; +class KernelInstantiation; +class KernelLauncher; + +/*! An object representing a program made up of source code, headers + * and options. + */ +class Program { + private: + friend class KernelInstantiation; + std::string _name; + std::vector _options; + std::map _sources; + + // Private constructor used by deserialize() + Program() {} + + public: + /*! Create a program. 
+ * + * \param source A string containing either the source filename or + * the source itself; in the latter case, the first line must be + * the name of the program. + * \param headers A vector of strings representing the source of + * each header file required by the program. Each entry can be + * either the header filename or the header source itself; in + * the latter case, the first line must be the name of the header + * (i.e., the name by which the header is #included). + * \param options A vector of options to be passed to the + * NVRTC compiler. Include paths specified with \p -I + * are added to the search paths used by Jitify. The environment + * variable JITIFY_OPTIONS can also be used to define additional + * options. + * \param file_callback A pointer to a callback function that is + * invoked whenever a source file needs to be loaded. Inside this + * function, the user can either load/specify the source themselves + * or defer to Jitify's file-loading mechanisms. + * \note Program or header source files referenced by filename are + * looked-up using the following mechanisms (in this order): + * \note 1) By calling file_callback. + * \note 2) By looking for the file embedded in the executable via the GCC + * linker. + * \note 3) By looking for the file in the filesystem. + * + * \note Jitify recursively scans all source files for \p #include + * directives and automatically adds them to the set of headers needed + * by the program. + * If a \p #include directive references a header that cannot be found, + * the directive is automatically removed from the source code to prevent + * immediate compilation failure. This may result in compilation errors + * if the header was required by the program. + * + * \note Jitify automatically includes NVRTC-safe versions of some + * standard library headers. + */ + Program(std::string const& cuda_source, + std::vector const& given_headers = {}, + std::vector const& given_options = {}, + file_callback_type file_callback = nullptr) { + // Add pre-include built-in JIT-safe headers + std::vector headers = given_headers; + for (int i = 0; i < detail::preinclude_jitsafe_headers_count; ++i) { + const char* hdr_name = detail::preinclude_jitsafe_header_names[i]; + const std::string& hdr_source = + detail::get_jitsafe_headers_map().at(hdr_name); + headers.push_back(std::string(hdr_name) + "\n" + hdr_source); + } + + _options = given_options; + detail::add_options_from_env(_options); + std::vector include_paths; + detail::load_program(cuda_source, headers, file_callback, &include_paths, + &_sources, &_options, &_name); + } + + /*! Restore a serialized program. + * + * \param serialized_program The serialized program to restore. + * + * \see serialize + */ + static Program deserialize(std::string const& serialized_program) { + Program program; + if (!serialization::deserialize(serialized_program, &program._name, + &program._options, &program._sources)) { + throw std::runtime_error("Failed to deserialize program"); + } + return program; + } + + /*! Save the program. + * + * \see deserialize + */ + std::string serialize() const { + // Note: Must update kSerializationVersion if this is changed. + return serialization::serialize(_name, _options, _sources); + }; + + /*! Select a kernel. + * + * \param name The name of the kernel (unmangled and without + * template arguments). + * \param options A vector of options to be passed to the NVRTC + * compiler when compiling this kernel. 
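+ *
+ * \note A minimal sketch of the experimental flow, including serialization
+ * (the kernel name, template argument, and 'source' are placeholders):
+ * \code{.cpp}
+ * jitify::experimental::Program program(source);
+ * auto inst = program.kernel("my_kernel").instantiate({"float"});
+ * std::string blob = inst.serialize();  // e.g., cache this on disk
+ * auto restored =
+ *     jitify::experimental::KernelInstantiation::deserialize(blob);
+ * \endcode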
+ */ + Kernel kernel(std::string const& name, + std::vector const& options = {}) const; +}; + +class Kernel { + friend class KernelInstantiation; + Program const* _program; + std::string _name; + std::vector _options; + + public: + Kernel(Program const* program, std::string const& name, + std::vector const& options = {}) + : _program(program), _name(name), _options(options) {} + + /*! Instantiate the kernel. + * + * \param template_args A vector of template arguments represented as + * code-strings. These can be generated using + * \code{.cpp}jitify::reflection::reflect()\endcode or + * \code{.cpp}jitify::reflection::reflect(value)\endcode + * + * \note Template type deduction is not possible, so all types must be + * explicitly specified. + */ + KernelInstantiation instantiate( + std::vector const& template_args = + std::vector()) const; + + // Regular template instantiation syntax (note limited flexibility) + /*! Instantiate the kernel. + * + * \note The template arguments specified on this function are + * used to instantiate the kernel. Non-type template arguments must + * be wrapped with + * \code{.cpp}jitify::reflection::NonType\endcode + * + * \note Template type deduction is not possible, so all types must be + * explicitly specified. + */ + template + KernelInstantiation instantiate() const; + + // Template-like instantiation syntax + // E.g., instantiate(myvar,Type())(grid,block) + /*! Instantiate the kernel. + * + * \param targs The template arguments for the kernel, represented as + * values. Types must be wrapped with + * \code{.cpp}jitify::reflection::Type()\endcode or + * \code{.cpp}jitify::reflection::type_of(value)\endcode + * + * \note Template type deduction is not possible, so all types must be + * explicitly specified. + */ + template + KernelInstantiation instantiate(TemplateArgs... targs) const; +}; + +class KernelInstantiation { + friend class KernelLauncher; + std::unique_ptr _cuda_kernel; + + // Private constructor used by deserialize() + KernelInstantiation(std::string const& func_name, std::string const& ptx, + std::vector const& link_files, + std::vector const& link_paths) + : _cuda_kernel(new detail::CUDAKernel(func_name.c_str(), ptx.c_str(), + link_files, link_paths)) {} + + public: + KernelInstantiation(Kernel const& kernel, + std::vector const& template_args) { + Program const* program = kernel._program; + + std::string template_inst = + (template_args.empty() ? "" + : reflection::reflect_template(template_args)); + std::string instantiation = kernel._name + template_inst; + + std::vector options; + options.insert(options.begin(), program->_options.begin(), + program->_options.end()); + options.insert(options.begin(), kernel._options.begin(), + kernel._options.end()); + detail::detect_and_add_cuda_arch(options); + detail::detect_and_add_cxx11_flag(options); + + std::string log, ptx, mangled_instantiation; + std::vector linker_files, linker_paths; + detail::instantiate_kernel(program->_name, program->_sources, instantiation, + options, &log, &ptx, &mangled_instantiation, + &linker_files, &linker_paths); + + _cuda_kernel.reset(new detail::CUDAKernel(mangled_instantiation.c_str(), + ptx.c_str(), linker_files, + linker_paths)); + } + + /*! Implicit conversion to the underlying CUfunction object. + * + * \note This allows use of CUDA APIs like + * cuOccupancyMaxActiveBlocksPerMultiprocessor. + */ + operator CUfunction() const { return *_cuda_kernel; } + + /*! Restore a serialized kernel instantiation. 
+ * + * \param serialized_kernel_inst The serialized kernel instantiation to + * restore. + * + * \see serialize + */ + static KernelInstantiation deserialize( + std::string const& serialized_kernel_inst) { + std::string func_name, ptx; + std::vector link_files, link_paths; + if (!serialization::deserialize(serialized_kernel_inst, &func_name, &ptx, + &link_files, &link_paths)) { + throw std::runtime_error("Failed to deserialize kernel instantiation"); + } + return KernelInstantiation(func_name, ptx, link_files, link_paths); + } + + /*! Save the program. + * + * \see deserialize + */ + std::string serialize() const { + // Note: Must update kSerializationVersion if this is changed. + return serialization::serialize( + _cuda_kernel->function_name(), _cuda_kernel->ptx(), + _cuda_kernel->link_files(), _cuda_kernel->link_paths()); + } + + /*! Configure the kernel launch. + * + * \param grid The thread grid dimensions for the launch. + * \param block The thread block dimensions for the launch. + * \param smem The amount of shared memory to dynamically allocate, in + * bytes. + * \param stream The CUDA stream to launch the kernel in. + */ + KernelLauncher configure(dim3 grid, dim3 block, unsigned int smem = 0, + cudaStream_t stream = 0) const; + + /*! Configure the kernel launch with a 1-dimensional block and grid chosen + * automatically to maximise occupancy. + * + * \param max_block_size The upper limit on the block size, or 0 for no + * limit. + * \param smem The amount of shared memory to dynamically allocate, in bytes. + * \param smem_callback A function returning smem for a given block size + * (overrides \p smem). + * \param stream The CUDA stream to launch the kernel in. + * \param flags The flags to pass to + * cuOccupancyMaxPotentialBlockSizeWithFlags. + */ + KernelLauncher configure_1d_max_occupancy( + int max_block_size = 0, unsigned int smem = 0, + CUoccupancyB2DSize smem_callback = 0, cudaStream_t stream = 0, + unsigned int flags = 0) const; + + /* + * \deprecated Use \p get_global_ptr instead. + */ + CUdeviceptr get_constant_ptr(const char* name, size_t* size = nullptr) const { + return get_global_ptr(name, size); + } + + /* + * Get a device pointer to a global __constant__ or __device__ variable using + * its un-mangled name. If provided, *size is set to the size of the variable + * in bytes. + */ + CUdeviceptr get_global_ptr(const char* name, size_t* size = nullptr) const { + return _cuda_kernel->get_global_ptr(name, size); + } + + /* + * Copy data from a global __constant__ or __device__ array to the host using + * its un-mangled name. + */ + template + CUresult get_global_array(const char* name, T* data, size_t count, + CUstream stream = 0) const { + return _cuda_kernel->get_global_data(name, data, count, stream); + } + + /* + * Copy a value from a global __constant__ or __device__ variable to the host + * using its un-mangled name. + */ + template + CUresult get_global_value(const char* name, T* value, + CUstream stream = 0) const { + return get_global_array(name, value, 1, stream); + } + + /* + * Copy data from the host to a global __constant__ or __device__ array using + * its un-mangled name. + */ + template + CUresult set_global_array(const char* name, const T* data, size_t count, + CUstream stream = 0) const { + return _cuda_kernel->set_global_data(name, data, count, stream); + } + + /* + * Copy a value from the host to a global __constant__ or __device__ variable + * using its un-mangled name. 
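+ *
+ * A minimal sketch, assuming the JIT-compiled source declares a variable
+ * such as "__device__ float scale;" (the name is hypothetical) and
+ * kernel_inst is this KernelInstantiation:
+ *
+ *   kernel_inst.set_global_value("scale", 2.0f);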
+ */ + template + CUresult set_global_value(const char* name, const T& value, + CUstream stream = 0) const { + return set_global_array(name, &value, 1, stream); + } + + const std::string& mangled_name() const { + return _cuda_kernel->function_name(); + } + + const std::string& ptx() const { return _cuda_kernel->ptx(); } + + const std::vector& link_files() const { + return _cuda_kernel->link_files(); + } + + const std::vector& link_paths() const { + return _cuda_kernel->link_paths(); + } +}; + +class KernelLauncher { + KernelInstantiation const* _kernel_inst; + dim3 _grid; + dim3 _block; + unsigned int _smem; + cudaStream_t _stream; + + public: + KernelLauncher(KernelInstantiation const* kernel_inst, dim3 grid, dim3 block, + unsigned int smem = 0, cudaStream_t stream = 0) + : _kernel_inst(kernel_inst), + _grid(grid), + _block(block), + _smem(smem), + _stream(stream) {} + + // Note: It's important that there is no implicit conversion required + // for arg_ptrs, because otherwise the parameter pack version + // below gets called instead (probably resulting in a segfault). + /*! Launch the kernel. + * + * \param arg_ptrs A vector of pointers to each function argument for the + * kernel. + * \param arg_types A vector of function argument types represented + * as code-strings. This parameter is optional and is only used to print + * out the function signature. + */ + CUresult launch(std::vector arg_ptrs = {}, + std::vector arg_types = {}) const { +#if JITIFY_PRINT_LAUNCH + std::string arg_types_string = + (arg_types.empty() ? "..." : reflection::reflect_list(arg_types)); + std::cout << "Launching " << _kernel_inst->_cuda_kernel->function_name() + << "<<<" << _grid << "," << _block << "," << _smem << "," + << _stream << ">>>" + << "(" << arg_types_string << ")" << std::endl; +#endif + return _kernel_inst->_cuda_kernel->launch(_grid, _block, _smem, _stream, + arg_ptrs); + } + + /*! Launch the kernel. + * + * \param args Function arguments for the kernel. + */ + template + CUresult launch(ArgTypes... args) const { + return this->launch(std::vector({(void*)&args...}), + {reflection::reflect()...}); + } +}; + +inline Kernel Program::kernel(std::string const& name, + std::vector const& options) const { + return Kernel(this, name, options); +} + +inline KernelInstantiation Kernel::instantiate( + std::vector const& template_args) const { + return KernelInstantiation(*this, template_args); +} + +template +inline KernelInstantiation Kernel::instantiate() const { + return this->instantiate( + std::vector({reflection::reflect()...})); +} + +template +inline KernelInstantiation Kernel::instantiate(TemplateArgs... 
targs) const { + return this->instantiate( + std::vector({reflection::reflect(targs)...})); +} + +inline KernelLauncher KernelInstantiation::configure( + dim3 grid, dim3 block, unsigned int smem, cudaStream_t stream) const { + return KernelLauncher(this, grid, block, smem, stream); +} + +inline KernelLauncher KernelInstantiation::configure_1d_max_occupancy( + int max_block_size, unsigned int smem, CUoccupancyB2DSize smem_callback, + cudaStream_t stream, unsigned int flags) const { + int grid; + int block; + CUfunction func = *_cuda_kernel; + detail::get_1d_max_occupancy(func, smem_callback, &smem, max_block_size, + flags, &grid, &block); + return this->configure(grid, block, smem, stream); +} + +} // namespace experimental + +} // namespace jitify + +#if defined(_WIN32) || defined(_WIN64) +#pragma pop_macro("max") +#pragma pop_macro("min") +#pragma pop_macro("strtok_r") +#endif diff --git a/GraphBLAS/CUDA/local_cub/agent/agent_histogram.cuh b/GraphBLAS/CUDA/local_cub/agent/agent_histogram.cuh new file mode 100644 index 0000000000..37b1ec9734 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/agent/agent_histogram.cuh @@ -0,0 +1,787 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram . 
+ */ + +#pragma once + +#include + +#include "../util_type.cuh" +#include "../block/block_load.cuh" +#include "../grid/grid_queue.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy + ******************************************************************************/ + +/** + * + */ +enum BlockHistogramMemoryPreference +{ + GMEM, + SMEM, + BLEND +}; + + +/** + * Parameterizable tuning policy type for AgentHistogram + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _PIXELS_PER_THREAD, ///< Pixels per thread (per tile of input) + BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements + bool _RLE_COMPRESS, ///< Whether to perform localized RLE to compress samples before histogramming + BlockHistogramMemoryPreference _MEM_PREFERENCE, ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins) + bool _WORK_STEALING> ///< Whether to dequeue tiles from a global work queue +struct AgentHistogramPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + PIXELS_PER_THREAD = _PIXELS_PER_THREAD, ///< Pixels per thread (per tile of input) + IS_RLE_COMPRESS = _RLE_COMPRESS, ///< Whether to perform localized RLE to compress samples before histogramming + MEM_PREFERENCE = _MEM_PREFERENCE, ///< Whether to prefer privatized shared-memory bins (versus privatized global-memory bins) + IS_WORK_STEALING = _WORK_STEALING, ///< Whether to dequeue tiles from a global work queue + }; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements +}; + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief AgentHistogram implements a stateful abstraction of CUDA thread blocks for participating in device-wide histogram . + */ +template < + typename AgentHistogramPolicyT, ///< Parameterized AgentHistogramPolicy tuning policy type + int PRIVATIZED_SMEM_BINS, ///< Number of privatized shared-memory histogram bins of any channel. Zero indicates privatized counters to be maintained in device-accessible memory. + int NUM_CHANNELS, ///< Number of channels interleaved in the input data. Supports up to four channels. 
+ int NUM_ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename SampleIteratorT, ///< Random-access input iterator type for reading samples + typename CounterT, ///< Integer type for counting sample occurrences per histogram bin + typename PrivatizedDecodeOpT, ///< The transform operator type for determining privatized counter indices from samples, one for each channel + typename OutputDecodeOpT, ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel + typename OffsetT, ///< Signed integer type for global offsets + int PTX_ARCH = CUB_PTX_ARCH> ///< PTX compute capability +struct AgentHistogram +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + /// The sample type of the input iterator + typedef typename std::iterator_traits::value_type SampleT; + + /// The pixel type of SampleT + typedef typename CubVector::Type PixelT; + + /// The quad type of SampleT + typedef typename CubVector::Type QuadT; + + /// Constants + enum + { + BLOCK_THREADS = AgentHistogramPolicyT::BLOCK_THREADS, + + PIXELS_PER_THREAD = AgentHistogramPolicyT::PIXELS_PER_THREAD, + SAMPLES_PER_THREAD = PIXELS_PER_THREAD * NUM_CHANNELS, + QUADS_PER_THREAD = SAMPLES_PER_THREAD / 4, + + TILE_PIXELS = PIXELS_PER_THREAD * BLOCK_THREADS, + TILE_SAMPLES = SAMPLES_PER_THREAD * BLOCK_THREADS, + + IS_RLE_COMPRESS = AgentHistogramPolicyT::IS_RLE_COMPRESS, + + MEM_PREFERENCE = (PRIVATIZED_SMEM_BINS > 0) ? + AgentHistogramPolicyT::MEM_PREFERENCE : + GMEM, + + IS_WORK_STEALING = AgentHistogramPolicyT::IS_WORK_STEALING, + }; + + /// Cache load modifier for reading input elements + static const CacheLoadModifier LOAD_MODIFIER = AgentHistogramPolicyT::LOAD_MODIFIER; + + + /// Input iterator wrapper type (for applying cache modifier) + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedInputIterator + SampleIteratorT>::Type // Directly use the supplied input iterator type + WrappedSampleIteratorT; + + /// Pixel input iterator type (for applying cache modifier) + typedef CacheModifiedInputIterator + WrappedPixelIteratorT; + + /// Qaud input iterator type (for applying cache modifier) + typedef CacheModifiedInputIterator + WrappedQuadIteratorT; + + /// Parameterized BlockLoad type for samples + typedef BlockLoad< + SampleT, + BLOCK_THREADS, + SAMPLES_PER_THREAD, + AgentHistogramPolicyT::LOAD_ALGORITHM> + BlockLoadSampleT; + + /// Parameterized BlockLoad type for pixels + typedef BlockLoad< + PixelT, + BLOCK_THREADS, + PIXELS_PER_THREAD, + AgentHistogramPolicyT::LOAD_ALGORITHM> + BlockLoadPixelT; + + /// Parameterized BlockLoad type for quads + typedef BlockLoad< + QuadT, + BLOCK_THREADS, + QUADS_PER_THREAD, + AgentHistogramPolicyT::LOAD_ALGORITHM> + BlockLoadQuadT; + + /// Shared memory type required by this thread block + struct _TempStorage + { + CounterT histograms[NUM_ACTIVE_CHANNELS][PRIVATIZED_SMEM_BINS + 1]; // Smem needed for block-privatized smem histogram (with 1 word of padding) + + int tile_idx; + + // Aliasable storage layout + union Aliasable + { + typename BlockLoadSampleT::TempStorage sample_load; // Smem needed for loading a tile of samples + typename BlockLoadPixelT::TempStorage pixel_load; // Smem needed for loading a tile of pixels + typename BlockLoadQuadT::TempStorage quad_load; // Smem needed for loading a tile of quads + + } aliasable; + }; + + + /// 
Temporary storage type (unionable) + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + /// Reference to temp_storage + _TempStorage &temp_storage; + + /// Sample input iterator (with cache modifier applied, if possible) + WrappedSampleIteratorT d_wrapped_samples; + + /// Native pointer for input samples (possibly NULL if unavailable) + SampleT* d_native_samples; + + /// The number of output bins for each channel + int (&num_output_bins)[NUM_ACTIVE_CHANNELS]; + + /// The number of privatized bins for each channel + int (&num_privatized_bins)[NUM_ACTIVE_CHANNELS]; + + /// Reference to gmem privatized histograms for each channel + CounterT* d_privatized_histograms[NUM_ACTIVE_CHANNELS]; + + /// Reference to final output histograms (gmem) + CounterT* (&d_output_histograms)[NUM_ACTIVE_CHANNELS]; + + /// The transform operator for determining output bin-ids from privatized counter indices, one for each channel + OutputDecodeOpT (&output_decode_op)[NUM_ACTIVE_CHANNELS]; + + /// The transform operator for determining privatized counter indices from samples, one for each channel + PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS]; + + /// Whether to prefer privatized smem counters vs privatized global counters + bool prefer_smem; + + + //--------------------------------------------------------------------- + // Initialize privatized bin counters + //--------------------------------------------------------------------- + + // Initialize privatized bin counters + __device__ __forceinline__ void InitBinCounters(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]) + { + // Initialize histogram bin counts to zeros + #pragma unroll + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + { + for (int privatized_bin = threadIdx.x; privatized_bin < num_privatized_bins[CHANNEL]; privatized_bin += BLOCK_THREADS) + { + privatized_histograms[CHANNEL][privatized_bin] = 0; + } + } + + // Barrier to make sure all threads are done updating counters + CTA_SYNC(); + } + + + // Initialize privatized bin counters. Specialized for privatized shared-memory counters + __device__ __forceinline__ void InitSmemBinCounters() + { + CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]; + + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL]; + + InitBinCounters(privatized_histograms); + } + + + // Initialize privatized bin counters. 
Specialized for privatized global-memory counters + __device__ __forceinline__ void InitGmemBinCounters() + { + InitBinCounters(d_privatized_histograms); + } + + + //--------------------------------------------------------------------- + // Update final output histograms + //--------------------------------------------------------------------- + + // Update final output histograms from privatized histograms + __device__ __forceinline__ void StoreOutput(CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]) + { + // Barrier to make sure all threads are done updating counters + CTA_SYNC(); + + // Apply privatized bin counts to output bin counts + #pragma unroll + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + { + int channel_bins = num_privatized_bins[CHANNEL]; + for (int privatized_bin = threadIdx.x; + privatized_bin < channel_bins; + privatized_bin += BLOCK_THREADS) + { + int output_bin = -1; + CounterT count = privatized_histograms[CHANNEL][privatized_bin]; + bool is_valid = count > 0; + + output_decode_op[CHANNEL].template BinSelect((SampleT) privatized_bin, output_bin, is_valid); + + if (output_bin >= 0) + { + atomicAdd(&d_output_histograms[CHANNEL][output_bin], count); + } + + } + } + } + + + // Update final output histograms from privatized histograms. Specialized for privatized shared-memory counters + __device__ __forceinline__ void StoreSmemOutput() + { + CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]; + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL]; + + StoreOutput(privatized_histograms); + } + + + // Update final output histograms from privatized histograms. Specialized for privatized global-memory counters + __device__ __forceinline__ void StoreGmemOutput() + { + StoreOutput(d_privatized_histograms); + } + + + //--------------------------------------------------------------------- + // Tile accumulation + //--------------------------------------------------------------------- + + // Accumulate pixels. Specialized for RLE compression. + __device__ __forceinline__ void AccumulatePixels( + SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS], + bool is_valid[PIXELS_PER_THREAD], + CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS], + Int2Type is_rle_compress) + { + #pragma unroll + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + { + // Bin pixels + int bins[PIXELS_PER_THREAD]; + + #pragma unroll + for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL) + { + bins[PIXEL] = -1; + privatized_decode_op[CHANNEL].template BinSelect(samples[PIXEL][CHANNEL], bins[PIXEL], is_valid[PIXEL]); + } + + CounterT accumulator = 1; + + #pragma unroll + for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD - 1; ++PIXEL) + { + if (bins[PIXEL] != bins[PIXEL + 1]) + { + if (bins[PIXEL] >= 0) + atomicAdd(privatized_histograms[CHANNEL] + bins[PIXEL], accumulator); + + accumulator = 0; + } + accumulator++; + } + + // Last pixel + if (bins[PIXELS_PER_THREAD - 1] >= 0) + atomicAdd(privatized_histograms[CHANNEL] + bins[PIXELS_PER_THREAD - 1], accumulator); + } + } + + + // Accumulate pixels. Specialized for individual accumulation of each pixel. 
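// A host-side sketch (not part of the CUB sources) of the localized run-length
// compression performed by the RLE-specialized AccumulatePixels above: a run of
// identical bin ids held by one thread is folded into a single add instead of
// one atomic per pixel, reducing atomic traffic to the privatized histogram.
// The bin values and sizes below are made up for illustration; a bin of -1
// marks an invalid pixel, as in the agent. The per-pixel (non-RLE)
// specialization follows right after this sketch.
#include <cstdio>

int main()
{
    const int PIXELS_PER_THREAD = 8;
    int bins[PIXELS_PER_THREAD] = {2, 2, 2, 5, 5, -1, 7, 7};
    int histogram[16] = {0};

    int accumulator = 1;
    for (int p = 0; p < PIXELS_PER_THREAD - 1; ++p)
    {
        if (bins[p] != bins[p + 1])
        {
            if (bins[p] >= 0)
                histogram[bins[p]] += accumulator;   // atomicAdd in the device code
            accumulator = 0;
        }
        accumulator++;
    }
    if (bins[PIXELS_PER_THREAD - 1] >= 0)            // flush the final run
        histogram[bins[PIXELS_PER_THREAD - 1]] += accumulator;

    std::printf("bin 2: %d, bin 5: %d, bin 7: %d\n",
                histogram[2], histogram[5], histogram[7]);   // prints 3, 2, 2
    return 0;
}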
+ __device__ __forceinline__ void AccumulatePixels( + SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS], + bool is_valid[PIXELS_PER_THREAD], + CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS], + Int2Type is_rle_compress) + { + #pragma unroll + for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL) + { + #pragma unroll + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + { + int bin = -1; + privatized_decode_op[CHANNEL].template BinSelect(samples[PIXEL][CHANNEL], bin, is_valid[PIXEL]); + if (bin >= 0) + atomicAdd(privatized_histograms[CHANNEL] + bin, 1); + } + } + } + + + /** + * Accumulate pixel, specialized for smem privatized histogram + */ + __device__ __forceinline__ void AccumulateSmemPixels( + SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS], + bool is_valid[PIXELS_PER_THREAD]) + { + CounterT* privatized_histograms[NUM_ACTIVE_CHANNELS]; + + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + privatized_histograms[CHANNEL] = temp_storage.histograms[CHANNEL]; + + AccumulatePixels(samples, is_valid, privatized_histograms, Int2Type()); + } + + + /** + * Accumulate pixel, specialized for gmem privatized histogram + */ + __device__ __forceinline__ void AccumulateGmemPixels( + SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS], + bool is_valid[PIXELS_PER_THREAD]) + { + AccumulatePixels(samples, is_valid, d_privatized_histograms, Int2Type()); + } + + + + //--------------------------------------------------------------------- + // Tile loading + //--------------------------------------------------------------------- + + // Load full, aligned tile using pixel iterator (multi-channel) + template + __device__ __forceinline__ void LoadFullAlignedTile( + OffsetT block_offset, + int valid_samples, + SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], + Int2Type<_NUM_ACTIVE_CHANNELS> num_active_channels) + { + typedef PixelT AliasedPixels[PIXELS_PER_THREAD]; + + WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset)); + + // Load using a wrapped pixel iterator + BlockLoadPixelT(temp_storage.aliasable.pixel_load).Load( + d_wrapped_pixels, + reinterpret_cast(samples)); + } + + // Load full, aligned tile using quad iterator (single-channel) + __device__ __forceinline__ void LoadFullAlignedTile( + OffsetT block_offset, + int valid_samples, + SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], + Int2Type<1> num_active_channels) + { + typedef QuadT AliasedQuads[QUADS_PER_THREAD]; + + WrappedQuadIteratorT d_wrapped_quads((QuadT*) (d_native_samples + block_offset)); + + // Load using a wrapped quad iterator + BlockLoadQuadT(temp_storage.aliasable.quad_load).Load( + d_wrapped_quads, + reinterpret_cast(samples)); + } + + // Load full, aligned tile + __device__ __forceinline__ void LoadTile( + OffsetT block_offset, + int valid_samples, + SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], + Int2Type is_full_tile, + Int2Type is_aligned) + { + LoadFullAlignedTile(block_offset, valid_samples, samples, Int2Type()); + } + + // Load full, mis-aligned tile using sample iterator + __device__ __forceinline__ void LoadTile( + OffsetT block_offset, + int valid_samples, + SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], + Int2Type is_full_tile, + Int2Type is_aligned) + { + typedef SampleT AliasedSamples[SAMPLES_PER_THREAD]; + + // Load using sample iterator + BlockLoadSampleT(temp_storage.aliasable.sample_load).Load( + d_wrapped_samples + block_offset, + reinterpret_cast(samples)); + } + + // Load partially-full, aligned tile using the pixel iterator + 
__device__ __forceinline__ void LoadTile( + OffsetT block_offset, + int valid_samples, + SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], + Int2Type is_full_tile, + Int2Type is_aligned) + { + typedef PixelT AliasedPixels[PIXELS_PER_THREAD]; + + WrappedPixelIteratorT d_wrapped_pixels((PixelT*) (d_native_samples + block_offset)); + + int valid_pixels = valid_samples / NUM_CHANNELS; + + // Load using a wrapped pixel iterator + BlockLoadPixelT(temp_storage.aliasable.pixel_load).Load( + d_wrapped_pixels, + reinterpret_cast(samples), + valid_pixels); + } + + // Load partially-full, mis-aligned tile using sample iterator + __device__ __forceinline__ void LoadTile( + OffsetT block_offset, + int valid_samples, + SampleT (&samples)[PIXELS_PER_THREAD][NUM_CHANNELS], + Int2Type is_full_tile, + Int2Type is_aligned) + { + typedef SampleT AliasedSamples[SAMPLES_PER_THREAD]; + + BlockLoadSampleT(temp_storage.aliasable.sample_load).Load( + d_wrapped_samples + block_offset, + reinterpret_cast(samples), + valid_samples); + } + + + //--------------------------------------------------------------------- + // Tile processing + //--------------------------------------------------------------------- + + // Consume a tile of data samples + template < + bool IS_ALIGNED, // Whether the tile offset is aligned (quad-aligned for single-channel, pixel-aligned for multi-channel) + bool IS_FULL_TILE> // Whether the tile is full + __device__ __forceinline__ void ConsumeTile(OffsetT block_offset, int valid_samples) + { + SampleT samples[PIXELS_PER_THREAD][NUM_CHANNELS]; + bool is_valid[PIXELS_PER_THREAD]; + + // Load tile + LoadTile( + block_offset, + valid_samples, + samples, + Int2Type(), + Int2Type()); + + // Set valid flags + #pragma unroll + for (int PIXEL = 0; PIXEL < PIXELS_PER_THREAD; ++PIXEL) + is_valid[PIXEL] = IS_FULL_TILE || (((threadIdx.x * PIXELS_PER_THREAD + PIXEL) * NUM_CHANNELS) < valid_samples); + + // Accumulate samples +#if CUB_PTX_ARCH >= 120 + if (prefer_smem) + AccumulateSmemPixels(samples, is_valid); + else + AccumulateGmemPixels(samples, is_valid); +#else + AccumulateGmemPixels(samples, is_valid); +#endif + + } + + + // Consume row tiles. 
Specialized for work-stealing from queue + template + __device__ __forceinline__ void ConsumeTiles( + OffsetT num_row_pixels, ///< The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< The number of rows in the region of interest + OffsetT row_stride_samples, ///< The number of samples between starts of consecutive rows in the region of interest + int tiles_per_row, ///< Number of image tiles per row + GridQueue tile_queue, + Int2Type is_work_stealing) + { + + int num_tiles = num_rows * tiles_per_row; + int tile_idx = (blockIdx.y * gridDim.x) + blockIdx.x; + OffsetT num_even_share_tiles = gridDim.x * gridDim.y; + + while (tile_idx < num_tiles) + { + int row = tile_idx / tiles_per_row; + int col = tile_idx - (row * tiles_per_row); + OffsetT row_offset = row * row_stride_samples; + OffsetT col_offset = (col * TILE_SAMPLES); + OffsetT tile_offset = row_offset + col_offset; + + if (col == tiles_per_row - 1) + { + // Consume a partially-full tile at the end of the row + OffsetT num_remaining = (num_row_pixels * NUM_CHANNELS) - col_offset; + ConsumeTile(tile_offset, num_remaining); + } + else + { + // Consume full tile + ConsumeTile(tile_offset, TILE_SAMPLES); + } + + CTA_SYNC(); + + // Get next tile + if (threadIdx.x == 0) + temp_storage.tile_idx = tile_queue.Drain(1) + num_even_share_tiles; + + CTA_SYNC(); + + tile_idx = temp_storage.tile_idx; + } + } + + + // Consume row tiles. Specialized for even-share (striped across thread blocks) + template + __device__ __forceinline__ void ConsumeTiles( + OffsetT num_row_pixels, ///< The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< The number of rows in the region of interest + OffsetT row_stride_samples, ///< The number of samples between starts of consecutive rows in the region of interest + int tiles_per_row, ///< Number of image tiles per row + GridQueue tile_queue, + Int2Type is_work_stealing) + { + for (int row = blockIdx.y; row < num_rows; row += gridDim.y) + { + OffsetT row_begin = row * row_stride_samples; + OffsetT row_end = row_begin + (num_row_pixels * NUM_CHANNELS); + OffsetT tile_offset = row_begin + (blockIdx.x * TILE_SAMPLES); + + while (tile_offset < row_end) + { + OffsetT num_remaining = row_end - tile_offset; + + if (num_remaining < TILE_SAMPLES) + { + // Consume partial tile + ConsumeTile(tile_offset, num_remaining); + break; + } + + // Consume full tile + ConsumeTile(tile_offset, TILE_SAMPLES); + tile_offset += gridDim.x * TILE_SAMPLES; + } + } + } + + + //--------------------------------------------------------------------- + // Parameter extraction + //--------------------------------------------------------------------- + + // Return a native pixel pointer (specialized for CacheModifiedInputIterator types) + template < + CacheLoadModifier _MODIFIER, + typename _ValueT, + typename _OffsetT> + __device__ __forceinline__ SampleT* NativePointer(CacheModifiedInputIterator<_MODIFIER, _ValueT, _OffsetT> itr) + { + return itr.ptr; + } + + // Return a native pixel pointer (specialized for other types) + template + __device__ __forceinline__ SampleT* NativePointer(IteratorT itr) + { + return NULL; + } + + + + //--------------------------------------------------------------------- + // Interface + //--------------------------------------------------------------------- + + + /** + * Constructor + */ + __device__ __forceinline__ AgentHistogram( + TempStorage &temp_storage, ///< Reference to temp_storage + SampleIteratorT d_samples, ///< Input data to reduce + 
int (&num_output_bins)[NUM_ACTIVE_CHANNELS], ///< The number bins per final output histogram + int (&num_privatized_bins)[NUM_ACTIVE_CHANNELS], ///< The number bins per privatized histogram + CounterT* (&d_output_histograms)[NUM_ACTIVE_CHANNELS], ///< Reference to final output histograms + CounterT* (&d_privatized_histograms)[NUM_ACTIVE_CHANNELS], ///< Reference to privatized histograms + OutputDecodeOpT (&output_decode_op)[NUM_ACTIVE_CHANNELS], ///< The transform operator for determining output bin-ids from privatized counter indices, one for each channel + PrivatizedDecodeOpT (&privatized_decode_op)[NUM_ACTIVE_CHANNELS]) ///< The transform operator for determining privatized counter indices from samples, one for each channel + : + temp_storage(temp_storage.Alias()), + d_wrapped_samples(d_samples), + num_output_bins(num_output_bins), + num_privatized_bins(num_privatized_bins), + d_output_histograms(d_output_histograms), + privatized_decode_op(privatized_decode_op), + output_decode_op(output_decode_op), + d_native_samples(NativePointer(d_wrapped_samples)), + prefer_smem((MEM_PREFERENCE == SMEM) ? + true : // prefer smem privatized histograms + (MEM_PREFERENCE == GMEM) ? + false : // prefer gmem privatized histograms + blockIdx.x & 1) // prefer blended privatized histograms + { + int blockId = (blockIdx.y * gridDim.x) + blockIdx.x; + + // Initialize the locations of this block's privatized histograms + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + this->d_privatized_histograms[CHANNEL] = d_privatized_histograms[CHANNEL] + (blockId * num_privatized_bins[CHANNEL]); + } + + + /** + * Consume image + */ + __device__ __forceinline__ void ConsumeTiles( + OffsetT num_row_pixels, ///< The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< The number of rows in the region of interest + OffsetT row_stride_samples, ///< The number of samples between starts of consecutive rows in the region of interest + int tiles_per_row, ///< Number of image tiles per row + GridQueue tile_queue) ///< Queue descriptor for assigning tiles of work to thread blocks + { + // Check whether all row starting offsets are quad-aligned (in single-channel) or pixel-aligned (in multi-channel) + int quad_mask = AlignBytes::ALIGN_BYTES - 1; + int pixel_mask = AlignBytes::ALIGN_BYTES - 1; + size_t row_bytes = sizeof(SampleT) * row_stride_samples; + + bool quad_aligned_rows = (NUM_CHANNELS == 1) && (SAMPLES_PER_THREAD % 4 == 0) && // Single channel + ((size_t(d_native_samples) & quad_mask) == 0) && // ptr is quad-aligned + ((num_rows == 1) || ((row_bytes & quad_mask) == 0)); // number of row-samples is a multiple of the alignment of the quad + + bool pixel_aligned_rows = (NUM_CHANNELS > 1) && // Multi channel + ((size_t(d_native_samples) & pixel_mask) == 0) && // ptr is pixel-aligned + ((row_bytes & pixel_mask) == 0); // number of row-samples is a multiple of the alignment of the pixel + + // Whether rows are aligned and can be vectorized + if ((d_native_samples != NULL) && (quad_aligned_rows || pixel_aligned_rows)) + ConsumeTiles(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type()); + else + ConsumeTiles(num_row_pixels, num_rows, row_stride_samples, tiles_per_row, tile_queue, Int2Type()); + } + + + /** + * Initialize privatized bin counters. 
Specialized for privatized shared-memory counters + */ + __device__ __forceinline__ void InitBinCounters() + { + if (prefer_smem) + InitSmemBinCounters(); + else + InitGmemBinCounters(); + } + + + /** + * Store privatized histogram to device-accessible memory. Specialized for privatized shared-memory counters + */ + __device__ __forceinline__ void StoreOutput() + { + if (prefer_smem) + StoreSmemOutput(); + else + StoreGmemOutput(); + } + + +}; + + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/agent/agent_radix_sort_downsweep.cuh b/GraphBLAS/CUDA/local_cub/agent/agent_radix_sort_downsweep.cuh new file mode 100644 index 0000000000..faea88138e --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/agent/agent_radix_sort_downsweep.cuh @@ -0,0 +1,789 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * AgentRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep . 
+ */ + + +#pragma once + +#include + +#include "../thread/thread_load.cuh" +#include "../block/block_load.cuh" +#include "../block/block_store.cuh" +#include "../block/block_radix_rank.cuh" +#include "../block/block_exchange.cuh" +#include "../util_type.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Radix ranking algorithm + */ +enum RadixRankAlgorithm +{ + RADIX_RANK_BASIC, + RADIX_RANK_MEMOIZE, + RADIX_RANK_MATCH +}; + +/** + * Parameterizable tuning policy type for AgentRadixSortDownsweep + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading keys (and values) + RadixRankAlgorithm _RANK_ALGORITHM, ///< The radix ranking algorithm to use + BlockScanAlgorithm _SCAN_ALGORITHM, ///< The block scan algorithm to use + int _RADIX_BITS> ///< The number of radix bits, i.e., log2(bins) +struct AgentRadixSortDownsweepPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + RADIX_BITS = _RADIX_BITS, ///< The number of radix bits, i.e., log2(bins) + }; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading keys (and values) + static const RadixRankAlgorithm RANK_ALGORITHM = _RANK_ALGORITHM; ///< The radix ranking algorithm to use + static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use +}; + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + + + + + +/** + * \brief AgentRadixSortDownsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort downsweep . 
+ */ +template < + typename AgentRadixSortDownsweepPolicy, ///< Parameterized AgentRadixSortDownsweepPolicy tuning policy type + bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low + typename KeyT, ///< KeyT type + typename ValueT, ///< ValueT type + typename OffsetT> ///< Signed integer type for global offsets +struct AgentRadixSortDownsweep +{ + //--------------------------------------------------------------------- + // Type definitions and constants + //--------------------------------------------------------------------- + + // Appropriate unsigned-bits representation of KeyT + typedef typename Traits::UnsignedBits UnsignedBits; + + static const UnsignedBits LOWEST_KEY = Traits::LOWEST_KEY; + static const UnsignedBits MAX_KEY = Traits::MAX_KEY; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = AgentRadixSortDownsweepPolicy::LOAD_ALGORITHM; + static const CacheLoadModifier LOAD_MODIFIER = AgentRadixSortDownsweepPolicy::LOAD_MODIFIER; + static const RadixRankAlgorithm RANK_ALGORITHM = AgentRadixSortDownsweepPolicy::RANK_ALGORITHM; + static const BlockScanAlgorithm SCAN_ALGORITHM = AgentRadixSortDownsweepPolicy::SCAN_ALGORITHM; + + enum + { + BLOCK_THREADS = AgentRadixSortDownsweepPolicy::BLOCK_THREADS, + ITEMS_PER_THREAD = AgentRadixSortDownsweepPolicy::ITEMS_PER_THREAD, + RADIX_BITS = AgentRadixSortDownsweepPolicy::RADIX_BITS, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + + RADIX_DIGITS = 1 << RADIX_BITS, + KEYS_ONLY = Equals::VALUE, + }; + + // Input iterator wrapper type (for applying cache modifier)s + typedef CacheModifiedInputIterator KeysItr; + typedef CacheModifiedInputIterator ValuesItr; + + // Radix ranking type to use + typedef typename If<(RANK_ALGORITHM == RADIX_RANK_BASIC), + BlockRadixRank, + typename If<(RANK_ALGORITHM == RADIX_RANK_MEMOIZE), + BlockRadixRank, + BlockRadixRankMatch + >::Type + >::Type BlockRadixRankT; + + enum + { + /// Number of bin-starting offsets tracked per thread + BINS_TRACKED_PER_THREAD = BlockRadixRankT::BINS_TRACKED_PER_THREAD + }; + + // BlockLoad type (keys) + typedef BlockLoad< + UnsignedBits, + BLOCK_THREADS, + ITEMS_PER_THREAD, + LOAD_ALGORITHM> BlockLoadKeysT; + + // BlockLoad type (values) + typedef BlockLoad< + ValueT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + LOAD_ALGORITHM> BlockLoadValuesT; + + // Value exchange array type + typedef ValueT ValueExchangeT[TILE_ITEMS]; + + /** + * Shared memory storage layout + */ + union __align__(16) _TempStorage + { + typename BlockLoadKeysT::TempStorage load_keys; + typename BlockLoadValuesT::TempStorage load_values; + typename BlockRadixRankT::TempStorage radix_rank; + + struct + { + UnsignedBits exchange_keys[TILE_ITEMS]; + OffsetT relative_bin_offsets[RADIX_DIGITS]; + }; + + Uninitialized exchange_values; + + OffsetT exclusive_digit_prefix[RADIX_DIGITS]; + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Thread fields + //--------------------------------------------------------------------- + + // Shared storage for this CTA + _TempStorage &temp_storage; + + // Input and output device pointers + KeysItr d_keys_in; + ValuesItr d_values_in; + UnsignedBits *d_keys_out; + ValueT *d_values_out; + + // The global scatter base offset for each digit (valid in the first RADIX_DIGITS threads) + OffsetT bin_offset[BINS_TRACKED_PER_THREAD]; + + // The least-significant bit position of the current digit to extract + int current_bit; + + // 
Number of bits in current digit + int num_bits; + + // Whether to short-cirucit + int short_circuit; + + //--------------------------------------------------------------------- + // Utility methods + //--------------------------------------------------------------------- + + + /** + * Scatter ranked keys through shared memory, then to device-accessible memory + */ + template + __device__ __forceinline__ void ScatterKeys( + UnsignedBits (&twiddled_keys)[ITEMS_PER_THREAD], + OffsetT (&relative_bin_offsets)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + OffsetT valid_items) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + temp_storage.exchange_keys[ranks[ITEM]] = twiddled_keys[ITEM]; + } + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + UnsignedBits key = temp_storage.exchange_keys[threadIdx.x + (ITEM * BLOCK_THREADS)]; + UnsignedBits digit = BFE(key, current_bit, num_bits); + relative_bin_offsets[ITEM] = temp_storage.relative_bin_offsets[digit]; + + // Un-twiddle + key = Traits::TwiddleOut(key); + + if (FULL_TILE || + (static_cast(threadIdx.x + (ITEM * BLOCK_THREADS)) < valid_items)) + { + d_keys_out[relative_bin_offsets[ITEM] + threadIdx.x + (ITEM * BLOCK_THREADS)] = key; + } + } + } + + + /** + * Scatter ranked values through shared memory, then to device-accessible memory + */ + template + __device__ __forceinline__ void ScatterValues( + ValueT (&values)[ITEMS_PER_THREAD], + OffsetT (&relative_bin_offsets)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + OffsetT valid_items) + { + CTA_SYNC(); + + ValueExchangeT &exchange_values = temp_storage.exchange_values.Alias(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + exchange_values[ranks[ITEM]] = values[ITEM]; + } + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + ValueT value = exchange_values[threadIdx.x + (ITEM * BLOCK_THREADS)]; + + if (FULL_TILE || + (static_cast(threadIdx.x + (ITEM * BLOCK_THREADS)) < valid_items)) + { + d_values_out[relative_bin_offsets[ITEM] + threadIdx.x + (ITEM * BLOCK_THREADS)] = value; + } + } + } + + /** + * Load a tile of keys (specialized for full tile, any ranking algorithm) + */ + template + __device__ __forceinline__ void LoadKeys( + UnsignedBits (&keys)[ITEMS_PER_THREAD], + OffsetT block_offset, + OffsetT valid_items, + UnsignedBits oob_item, + Int2Type is_full_tile, + Int2Type<_RANK_ALGORITHM> rank_algorithm) + { + BlockLoadKeysT(temp_storage.load_keys).Load( + d_keys_in + block_offset, keys); + + CTA_SYNC(); + } + + + /** + * Load a tile of keys (specialized for partial tile, any ranking algorithm) + */ + template + __device__ __forceinline__ void LoadKeys( + UnsignedBits (&keys)[ITEMS_PER_THREAD], + OffsetT block_offset, + OffsetT valid_items, + UnsignedBits oob_item, + Int2Type is_full_tile, + Int2Type<_RANK_ALGORITHM> rank_algorithm) + { + // Register pressure work-around: moving valid_items through shfl prevents compiler + // from reusing guards/addressing from prior guarded loads + valid_items = ShuffleIndex(valid_items, 0, 0xffffffff); + + BlockLoadKeysT(temp_storage.load_keys).Load( + d_keys_in + block_offset, keys, valid_items, oob_item); + + CTA_SYNC(); + } + + + /** + * Load a tile of keys (specialized for full tile, match ranking algorithm) + */ + __device__ __forceinline__ void LoadKeys( + UnsignedBits (&keys)[ITEMS_PER_THREAD], + OffsetT block_offset, + OffsetT valid_items, + UnsignedBits oob_item, + Int2Type is_full_tile, + 
Int2Type rank_algorithm) + { + LoadDirectWarpStriped(threadIdx.x, d_keys_in + block_offset, keys); + } + + + /** + * Load a tile of keys (specialized for partial tile, match ranking algorithm) + */ + __device__ __forceinline__ void LoadKeys( + UnsignedBits (&keys)[ITEMS_PER_THREAD], + OffsetT block_offset, + OffsetT valid_items, + UnsignedBits oob_item, + Int2Type is_full_tile, + Int2Type rank_algorithm) + { + // Register pressure work-around: moving valid_items through shfl prevents compiler + // from reusing guards/addressing from prior guarded loads + valid_items = ShuffleIndex(valid_items, 0, 0xffffffff); + + LoadDirectWarpStriped(threadIdx.x, d_keys_in + block_offset, keys, valid_items, oob_item); + } + + + /** + * Load a tile of values (specialized for full tile, any ranking algorithm) + */ + template + __device__ __forceinline__ void LoadValues( + ValueT (&values)[ITEMS_PER_THREAD], + OffsetT block_offset, + OffsetT valid_items, + Int2Type is_full_tile, + Int2Type<_RANK_ALGORITHM> rank_algorithm) + { + BlockLoadValuesT(temp_storage.load_values).Load( + d_values_in + block_offset, values); + + CTA_SYNC(); + } + + + /** + * Load a tile of values (specialized for partial tile, any ranking algorithm) + */ + template + __device__ __forceinline__ void LoadValues( + ValueT (&values)[ITEMS_PER_THREAD], + OffsetT block_offset, + OffsetT valid_items, + Int2Type is_full_tile, + Int2Type<_RANK_ALGORITHM> rank_algorithm) + { + // Register pressure work-around: moving valid_items through shfl prevents compiler + // from reusing guards/addressing from prior guarded loads + valid_items = ShuffleIndex(valid_items, 0, 0xffffffff); + + BlockLoadValuesT(temp_storage.load_values).Load( + d_values_in + block_offset, values, valid_items); + + CTA_SYNC(); + } + + + /** + * Load a tile of items (specialized for full tile, match ranking algorithm) + */ + __device__ __forceinline__ void LoadValues( + ValueT (&values)[ITEMS_PER_THREAD], + OffsetT block_offset, + OffsetT valid_items, + Int2Type is_full_tile, + Int2Type rank_algorithm) + { + LoadDirectWarpStriped(threadIdx.x, d_values_in + block_offset, values); + } + + + /** + * Load a tile of items (specialized for partial tile, match ranking algorithm) + */ + __device__ __forceinline__ void LoadValues( + ValueT (&values)[ITEMS_PER_THREAD], + OffsetT block_offset, + OffsetT valid_items, + Int2Type is_full_tile, + Int2Type rank_algorithm) + { + // Register pressure work-around: moving valid_items through shfl prevents compiler + // from reusing guards/addressing from prior guarded loads + valid_items = ShuffleIndex(valid_items, 0, 0xffffffff); + + LoadDirectWarpStriped(threadIdx.x, d_values_in + block_offset, values, valid_items); + } + + + /** + * Truck along associated values + */ + template + __device__ __forceinline__ void GatherScatterValues( + OffsetT (&relative_bin_offsets)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + OffsetT block_offset, + OffsetT valid_items, + Int2Type /*is_keys_only*/) + { + ValueT values[ITEMS_PER_THREAD]; + + CTA_SYNC(); + + LoadValues( + values, + block_offset, + valid_items, + Int2Type(), + Int2Type()); + + ScatterValues( + values, + relative_bin_offsets, + ranks, + valid_items); + } + + + /** + * Truck along associated values (specialized for key-only sorting) + */ + template + __device__ __forceinline__ void GatherScatterValues( + OffsetT (&/*relative_bin_offsets*/)[ITEMS_PER_THREAD], + int (&/*ranks*/)[ITEMS_PER_THREAD], + OffsetT /*block_offset*/, + OffsetT /*valid_items*/, + Int2Type /*is_keys_only*/) + {} + + + 
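// A minimal, self-contained analogue (not the CUB type itself) of the Int2Type
// tag-dispatch used by GatherScatterValues above, and by the full/partial-tile
// and RLE specializations elsewhere in these agents: a compile-time integer is
// lifted into a distinct type so overload resolution selects the right
// specialization with no run-time branch. Names here (Int2TypeSketch,
// gather_scatter) are illustrative only.
#include <cstdio>

template <int VALUE>
struct Int2TypeSketch { enum { V = VALUE }; };        // stand-in for cub::Int2Type

// Key-value path: values must be gathered and scattered.
void gather_scatter(Int2TypeSketch<false> /*is_keys_only*/)
{
    std::printf("key-value path: load, rank-permute, and scatter the values\n");
}

// Keys-only path: the overload is empty and compiles away entirely.
void gather_scatter(Int2TypeSketch<true> /*is_keys_only*/) {}

int main()
{
    // In the agent this flag is the KEYS_ONLY constant (Equals<ValueT, NullType>).
    const bool KEYS_ONLY = false;
    gather_scatter(Int2TypeSketch<KEYS_ONLY>());      // resolved at compile time
    gather_scatter(Int2TypeSketch<true>());           // keys-only: no work emitted
    return 0;
}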
/** + * Process tile + */ + template + __device__ __forceinline__ void ProcessTile( + OffsetT block_offset, + const OffsetT &valid_items = TILE_ITEMS) + { + UnsignedBits keys[ITEMS_PER_THREAD]; + int ranks[ITEMS_PER_THREAD]; + OffsetT relative_bin_offsets[ITEMS_PER_THREAD]; + + // Assign default (min/max) value to all keys + UnsignedBits default_key = (IS_DESCENDING) ? LOWEST_KEY : MAX_KEY; + + // Load tile of keys + LoadKeys( + keys, + block_offset, + valid_items, + default_key, + Int2Type(), + Int2Type()); + + // Twiddle key bits if necessary + #pragma unroll + for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) + { + keys[KEY] = Traits::TwiddleIn(keys[KEY]); + } + + // Rank the twiddled keys + int exclusive_digit_prefix[BINS_TRACKED_PER_THREAD]; + BlockRadixRankT(temp_storage.radix_rank).RankKeys( + keys, + ranks, + current_bit, + num_bits, + exclusive_digit_prefix); + + CTA_SYNC(); + + // Share exclusive digit prefix + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + { + // Store exclusive prefix + temp_storage.exclusive_digit_prefix[bin_idx] = + exclusive_digit_prefix[track]; + } + } + + CTA_SYNC(); + + // Get inclusive digit prefix + int inclusive_digit_prefix[BINS_TRACKED_PER_THREAD]; + + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + { + if (IS_DESCENDING) + { + // Get inclusive digit prefix from exclusive prefix (higher bins come first) + inclusive_digit_prefix[track] = (bin_idx == 0) ? + (BLOCK_THREADS * ITEMS_PER_THREAD) : + temp_storage.exclusive_digit_prefix[bin_idx - 1]; + } + else + { + // Get inclusive digit prefix from exclusive prefix (lower bins come first) + inclusive_digit_prefix[track] = (bin_idx == RADIX_DIGITS - 1) ? 
+ (BLOCK_THREADS * ITEMS_PER_THREAD) : + temp_storage.exclusive_digit_prefix[bin_idx + 1]; + } + } + } + + CTA_SYNC(); + + // Update global scatter base offsets for each digit + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + { + bin_offset[track] -= exclusive_digit_prefix[track]; + temp_storage.relative_bin_offsets[bin_idx] = bin_offset[track]; + bin_offset[track] += inclusive_digit_prefix[track]; + } + } + + CTA_SYNC(); + + // Scatter keys + ScatterKeys(keys, relative_bin_offsets, ranks, valid_items); + + // Gather/scatter values + GatherScatterValues(relative_bin_offsets , ranks, block_offset, valid_items, Int2Type()); + } + + //--------------------------------------------------------------------- + // Copy shortcut + //--------------------------------------------------------------------- + + /** + * Copy tiles within the range of input + */ + template < + typename InputIteratorT, + typename T> + __device__ __forceinline__ void Copy( + InputIteratorT d_in, + T *d_out, + OffsetT block_offset, + OffsetT block_end) + { + // Simply copy the input + while (block_offset + TILE_ITEMS <= block_end) + { + T items[ITEMS_PER_THREAD]; + + LoadDirectStriped(threadIdx.x, d_in + block_offset, items); + CTA_SYNC(); + StoreDirectStriped(threadIdx.x, d_out + block_offset, items); + + block_offset += TILE_ITEMS; + } + + // Clean up last partial tile with guarded-I/O + if (block_offset < block_end) + { + OffsetT valid_items = block_end - block_offset; + + T items[ITEMS_PER_THREAD]; + + LoadDirectStriped(threadIdx.x, d_in + block_offset, items, valid_items); + CTA_SYNC(); + StoreDirectStriped(threadIdx.x, d_out + block_offset, items, valid_items); + } + } + + + /** + * Copy tiles within the range of input (specialized for NullType) + */ + template + __device__ __forceinline__ void Copy( + InputIteratorT /*d_in*/, + NullType * /*d_out*/, + OffsetT /*block_offset*/, + OffsetT /*block_end*/) + {} + + + //--------------------------------------------------------------------- + // Interface + //--------------------------------------------------------------------- + + /** + * Constructor + */ + __device__ __forceinline__ AgentRadixSortDownsweep( + TempStorage &temp_storage, + OffsetT (&bin_offset)[BINS_TRACKED_PER_THREAD], + OffsetT num_items, + const KeyT *d_keys_in, + KeyT *d_keys_out, + const ValueT *d_values_in, + ValueT *d_values_out, + int current_bit, + int num_bits) + : + temp_storage(temp_storage.Alias()), + d_keys_in(reinterpret_cast(d_keys_in)), + d_values_in(d_values_in), + d_keys_out(reinterpret_cast(d_keys_out)), + d_values_out(d_values_out), + current_bit(current_bit), + num_bits(num_bits), + short_circuit(1) + { + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + this->bin_offset[track] = bin_offset[track]; + + int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + { + // Short circuit if the histogram has only bin counts of only zeros or problem-size + short_circuit = short_circuit && ((bin_offset[track] == 0) || (bin_offset[track] == num_items)); + } + } + + short_circuit = CTA_SYNC_AND(short_circuit); + } + + + /** + * Constructor + */ + __device__ __forceinline__ AgentRadixSortDownsweep( + TempStorage &temp_storage, + OffsetT num_items, + OffsetT *d_spine, + const KeyT *d_keys_in, + KeyT *d_keys_out, + const ValueT 
*d_values_in, + ValueT *d_values_out, + int current_bit, + int num_bits) + : + temp_storage(temp_storage.Alias()), + d_keys_in(reinterpret_cast(d_keys_in)), + d_values_in(d_values_in), + d_keys_out(reinterpret_cast(d_keys_out)), + d_values_out(d_values_out), + current_bit(current_bit), + num_bits(num_bits), + short_circuit(1) + { + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + + // Load digit bin offsets (each of the first RADIX_DIGITS threads will load an offset for that digit) + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + { + if (IS_DESCENDING) + bin_idx = RADIX_DIGITS - bin_idx - 1; + + // Short circuit if the first block's histogram has only bin counts of only zeros or problem-size + OffsetT first_block_bin_offset = d_spine[gridDim.x * bin_idx]; + short_circuit = short_circuit && ((first_block_bin_offset == 0) || (first_block_bin_offset == num_items)); + + // Load my block's bin offset for my bin + bin_offset[track] = d_spine[(gridDim.x * bin_idx) + blockIdx.x]; + } + } + + short_circuit = CTA_SYNC_AND(short_circuit); + } + + + /** + * Distribute keys from a segment of input tiles. + */ + __device__ __forceinline__ void ProcessRegion( + OffsetT block_offset, + OffsetT block_end) + { + if (short_circuit) + { + // Copy keys + Copy(d_keys_in, d_keys_out, block_offset, block_end); + + // Copy values + Copy(d_values_in, d_values_out, block_offset, block_end); + } + else + { + // Process full tiles of tile_items + #pragma unroll 1 + while (block_offset + TILE_ITEMS <= block_end) + { + ProcessTile(block_offset); + block_offset += TILE_ITEMS; + + CTA_SYNC(); + } + + // Clean up last partial tile with guarded-I/O + if (block_offset < block_end) + { + ProcessTile(block_offset, block_end - block_offset); + } + + } + } + +}; + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/agent/agent_radix_sort_upsweep.cuh b/GraphBLAS/CUDA/local_cub/agent/agent_radix_sort_upsweep.cuh new file mode 100644 index 0000000000..2081cefba9 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/agent/agent_radix_sort_upsweep.cuh @@ -0,0 +1,526 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * AgentRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep . + */ + +#pragma once + +#include "../thread/thread_reduce.cuh" +#include "../thread/thread_load.cuh" +#include "../warp/warp_reduce.cuh" +#include "../block/block_load.cuh" +#include "../util_type.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for AgentRadixSortUpsweep + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading keys + int _RADIX_BITS> ///< The number of radix bits, i.e., log2(bins) +struct AgentRadixSortUpsweepPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + RADIX_BITS = _RADIX_BITS, ///< The number of radix bits, i.e., log2(bins) + }; + + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading keys +}; + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief AgentRadixSortUpsweep implements a stateful abstraction of CUDA thread blocks for participating in device-wide radix sort upsweep . 
+ */ +template < + typename AgentRadixSortUpsweepPolicy, ///< Parameterized AgentRadixSortUpsweepPolicy tuning policy type + typename KeyT, ///< KeyT type + typename OffsetT> ///< Signed integer type for global offsets +struct AgentRadixSortUpsweep +{ + + //--------------------------------------------------------------------- + // Type definitions and constants + //--------------------------------------------------------------------- + + typedef typename Traits::UnsignedBits UnsignedBits; + + // Integer type for digit counters (to be packed into words of PackedCounters) + typedef unsigned char DigitCounter; + + // Integer type for packing DigitCounters into columns of shared memory banks + typedef unsigned int PackedCounter; + + static const CacheLoadModifier LOAD_MODIFIER = AgentRadixSortUpsweepPolicy::LOAD_MODIFIER; + + enum + { + RADIX_BITS = AgentRadixSortUpsweepPolicy::RADIX_BITS, + BLOCK_THREADS = AgentRadixSortUpsweepPolicy::BLOCK_THREADS, + KEYS_PER_THREAD = AgentRadixSortUpsweepPolicy::ITEMS_PER_THREAD, + + RADIX_DIGITS = 1 << RADIX_BITS, + + LOG_WARP_THREADS = CUB_PTX_LOG_WARP_THREADS, + WARP_THREADS = 1 << LOG_WARP_THREADS, + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + + TILE_ITEMS = BLOCK_THREADS * KEYS_PER_THREAD, + + BYTES_PER_COUNTER = sizeof(DigitCounter), + LOG_BYTES_PER_COUNTER = Log2::VALUE, + + PACKING_RATIO = sizeof(PackedCounter) / sizeof(DigitCounter), + LOG_PACKING_RATIO = Log2::VALUE, + + LOG_COUNTER_LANES = CUB_MAX(0, RADIX_BITS - LOG_PACKING_RATIO), + COUNTER_LANES = 1 << LOG_COUNTER_LANES, + + // To prevent counter overflow, we must periodically unpack and aggregate the + // digit counters back into registers. Each counter lane is assigned to a + // warp for aggregation. + + LANES_PER_WARP = CUB_MAX(1, (COUNTER_LANES + WARPS - 1) / WARPS), + + // Unroll tiles in batches without risk of counter overflow + UNROLL_COUNT = CUB_MIN(64, 255 / KEYS_PER_THREAD), + UNROLLED_ELEMENTS = UNROLL_COUNT * TILE_ITEMS, + }; + + + // Input iterator wrapper type (for applying cache modifier)s + typedef CacheModifiedInputIterator KeysItr; + + /** + * Shared memory storage layout + */ + union __align__(16) _TempStorage + { + DigitCounter thread_counters[COUNTER_LANES][BLOCK_THREADS][PACKING_RATIO]; + PackedCounter packed_thread_counters[COUNTER_LANES][BLOCK_THREADS]; + OffsetT block_counters[WARP_THREADS][RADIX_DIGITS]; + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Thread fields (aggregate state bundle) + //--------------------------------------------------------------------- + + // Shared storage for this CTA + _TempStorage &temp_storage; + + // Thread-local counters for periodically aggregating composite-counter lanes + OffsetT local_counts[LANES_PER_WARP][PACKING_RATIO]; + + // Input and output device pointers + KeysItr d_keys_in; + + // The least-significant bit position of the current digit to extract + int current_bit; + + // Number of bits in current digit + int num_bits; + + + + //--------------------------------------------------------------------- + // Helper structure for templated iteration + //--------------------------------------------------------------------- + + // Iterate + template + struct Iterate + { + // BucketKeys + static __device__ __forceinline__ void BucketKeys( + AgentRadixSortUpsweep &cta, + UnsignedBits keys[KEYS_PER_THREAD]) + { + cta.Bucket(keys[COUNT]); + + // Next + 
Iterate::BucketKeys(cta, keys); + } + }; + + // Terminate + template + struct Iterate + { + // BucketKeys + static __device__ __forceinline__ void BucketKeys(AgentRadixSortUpsweep &/*cta*/, UnsignedBits /*keys*/[KEYS_PER_THREAD]) {} + }; + + + //--------------------------------------------------------------------- + // Utility methods + //--------------------------------------------------------------------- + + /** + * Decode a key and increment corresponding smem digit counter + */ + __device__ __forceinline__ void Bucket(UnsignedBits key) + { + // Perform transform op + UnsignedBits converted_key = Traits::TwiddleIn(key); + + // Extract current digit bits + UnsignedBits digit = BFE(converted_key, current_bit, num_bits); + + // Get sub-counter offset + UnsignedBits sub_counter = digit & (PACKING_RATIO - 1); + + // Get row offset + UnsignedBits row_offset = digit >> LOG_PACKING_RATIO; + + // Increment counter + temp_storage.thread_counters[row_offset][threadIdx.x][sub_counter]++; + } + + + /** + * Reset composite counters + */ + __device__ __forceinline__ void ResetDigitCounters() + { + #pragma unroll + for (int LANE = 0; LANE < COUNTER_LANES; LANE++) + { + temp_storage.packed_thread_counters[LANE][threadIdx.x] = 0; + } + } + + + /** + * Reset the unpacked counters in each thread + */ + __device__ __forceinline__ void ResetUnpackedCounters() + { + #pragma unroll + for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) + { + #pragma unroll + for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++) + { + local_counts[LANE][UNPACKED_COUNTER] = 0; + } + } + } + + + /** + * Extracts and aggregates the digit counters for each counter lane + * owned by this warp + */ + __device__ __forceinline__ void UnpackDigitCounts() + { + unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; + unsigned int warp_tid = LaneId(); + + #pragma unroll + for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) + { + const int counter_lane = (LANE * WARPS) + warp_id; + if (counter_lane < COUNTER_LANES) + { + #pragma unroll + for (int PACKED_COUNTER = 0; PACKED_COUNTER < BLOCK_THREADS; PACKED_COUNTER += WARP_THREADS) + { + #pragma unroll + for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++) + { + OffsetT counter = temp_storage.thread_counters[counter_lane][warp_tid + PACKED_COUNTER][UNPACKED_COUNTER]; + local_counts[LANE][UNPACKED_COUNTER] += counter; + } + } + } + } + } + + + /** + * Processes a single, full tile + */ + __device__ __forceinline__ void ProcessFullTile(OffsetT block_offset) + { + // Tile of keys + UnsignedBits keys[KEYS_PER_THREAD]; + + LoadDirectStriped(threadIdx.x, d_keys_in + block_offset, keys); + + // Prevent hoisting + CTA_SYNC(); + + // Bucket tile of keys + Iterate<0, KEYS_PER_THREAD>::BucketKeys(*this, keys); + } + + + /** + * Processes a single load (may have some threads masked off) + */ + __device__ __forceinline__ void ProcessPartialTile( + OffsetT block_offset, + const OffsetT &block_end) + { + // Process partial tile if necessary using single loads + block_offset += threadIdx.x; + while (block_offset < block_end) + { + // Load and bucket key + UnsignedBits key = d_keys_in[block_offset]; + Bucket(key); + block_offset += BLOCK_THREADS; + } + } + + + //--------------------------------------------------------------------- + // Interface + //--------------------------------------------------------------------- + + /** + * Constructor + */ + __device__ __forceinline__ AgentRadixSortUpsweep( + TempStorage &temp_storage, + const KeyT *d_keys_in, 
+ int current_bit, + int num_bits) + : + temp_storage(temp_storage.Alias()), + d_keys_in(reinterpret_cast(d_keys_in)), + current_bit(current_bit), + num_bits(num_bits) + {} + + + /** + * Compute radix digit histograms from a segment of input tiles. + */ + __device__ __forceinline__ void ProcessRegion( + OffsetT block_offset, + const OffsetT &block_end) + { + // Reset digit counters in smem and unpacked counters in registers + ResetDigitCounters(); + ResetUnpackedCounters(); + + // Unroll batches of full tiles + while (block_offset + UNROLLED_ELEMENTS <= block_end) + { + for (int i = 0; i < UNROLL_COUNT; ++i) + { + ProcessFullTile(block_offset); + block_offset += TILE_ITEMS; + } + + CTA_SYNC(); + + // Aggregate back into local_count registers to prevent overflow + UnpackDigitCounts(); + + CTA_SYNC(); + + // Reset composite counters in lanes + ResetDigitCounters(); + } + + // Unroll single full tiles + while (block_offset + TILE_ITEMS <= block_end) + { + ProcessFullTile(block_offset); + block_offset += TILE_ITEMS; + } + + // Process partial tile if necessary + ProcessPartialTile( + block_offset, + block_end); + + CTA_SYNC(); + + // Aggregate back into local_count registers + UnpackDigitCounts(); + } + + + /** + * Extract counts (saving them to the external array) + */ + template + __device__ __forceinline__ void ExtractCounts( + OffsetT *counters, + int bin_stride = 1, + int bin_offset = 0) + { + unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; + unsigned int warp_tid = LaneId(); + + // Place unpacked digit counters in shared memory + #pragma unroll + for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) + { + int counter_lane = (LANE * WARPS) + warp_id; + if (counter_lane < COUNTER_LANES) + { + int digit_row = counter_lane << LOG_PACKING_RATIO; + + #pragma unroll + for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++) + { + int bin_idx = digit_row + UNPACKED_COUNTER; + + temp_storage.block_counters[warp_tid][bin_idx] = + local_counts[LANE][UNPACKED_COUNTER]; + } + } + } + + CTA_SYNC(); + + // Rake-reduce bin_count reductions + + // Whole blocks + #pragma unroll + for (int BIN_BASE = RADIX_DIGITS % BLOCK_THREADS; + (BIN_BASE + BLOCK_THREADS) <= RADIX_DIGITS; + BIN_BASE += BLOCK_THREADS) + { + int bin_idx = BIN_BASE + threadIdx.x; + + OffsetT bin_count = 0; + #pragma unroll + for (int i = 0; i < WARP_THREADS; ++i) + bin_count += temp_storage.block_counters[i][bin_idx]; + + if (IS_DESCENDING) + bin_idx = RADIX_DIGITS - bin_idx - 1; + + counters[(bin_stride * bin_idx) + bin_offset] = bin_count; + } + + // Remainder + if ((RADIX_DIGITS % BLOCK_THREADS != 0) && (threadIdx.x < RADIX_DIGITS)) + { + int bin_idx = threadIdx.x; + + OffsetT bin_count = 0; + #pragma unroll + for (int i = 0; i < WARP_THREADS; ++i) + bin_count += temp_storage.block_counters[i][bin_idx]; + + if (IS_DESCENDING) + bin_idx = RADIX_DIGITS - bin_idx - 1; + + counters[(bin_stride * bin_idx) + bin_offset] = bin_count; + } + } + + + /** + * Extract counts + */ + template + __device__ __forceinline__ void ExtractCounts( + OffsetT (&bin_count)[BINS_TRACKED_PER_THREAD]) ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... 
(threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] + { + unsigned int warp_id = threadIdx.x >> LOG_WARP_THREADS; + unsigned int warp_tid = LaneId(); + + // Place unpacked digit counters in shared memory + #pragma unroll + for (int LANE = 0; LANE < LANES_PER_WARP; LANE++) + { + int counter_lane = (LANE * WARPS) + warp_id; + if (counter_lane < COUNTER_LANES) + { + int digit_row = counter_lane << LOG_PACKING_RATIO; + + #pragma unroll + for (int UNPACKED_COUNTER = 0; UNPACKED_COUNTER < PACKING_RATIO; UNPACKED_COUNTER++) + { + int bin_idx = digit_row + UNPACKED_COUNTER; + + temp_storage.block_counters[warp_tid][bin_idx] = + local_counts[LANE][UNPACKED_COUNTER]; + } + } + } + + CTA_SYNC(); + + // Rake-reduce bin_count reductions + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + { + bin_count[track] = 0; + + #pragma unroll + for (int i = 0; i < WARP_THREADS; ++i) + bin_count[track] += temp_storage.block_counters[i][bin_idx]; + } + } + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/agent/agent_reduce.cuh b/GraphBLAS/CUDA/local_cub/agent/agent_reduce.cuh new file mode 100644 index 0000000000..000a905ccf --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/agent/agent_reduce.cuh @@ -0,0 +1,385 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::AgentReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction . 
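Editor's note: the strategy the class below spells out is a two-level reduction. Each thread privately folds the block-striped items it loads from successive tiles into a running thread_aggregate, and a single block-wide reduction at the end combines the per-thread partials. The following host-side sketch models that strategy sequentially; it is an editor's illustration, not CUB code, the toy sizes are arbitrary, and the zero-initialized aggregates are a simplification (the real agent seeds each aggregate with the first item it reads).

#include <cstdio>
#include <vector>

int main()
{
    const int BLOCK_THREADS    = 4;   // threads per block (toy size)
    const int ITEMS_PER_THREAD = 2;   // items per thread per tile
    const int TILE_ITEMS       = BLOCK_THREADS * ITEMS_PER_THREAD;

    std::vector<int> d_in(22);
    for (int i = 0; i < (int) d_in.size(); ++i) d_in[i] = i;

    // Per-thread partial reductions (thread_aggregate in the agent)
    std::vector<long long> thread_aggregate(BLOCK_THREADS, 0);

    // Full tiles: block-striped loads, reduced privately by each thread
    int block_offset = 0;
    while (block_offset + TILE_ITEMS <= (int) d_in.size())
    {
        for (int t = 0; t < BLOCK_THREADS; ++t)
            for (int i = 0; i < ITEMS_PER_THREAD; ++i)
                thread_aggregate[t] += d_in[block_offset + (i * BLOCK_THREADS) + t];
        block_offset += TILE_ITEMS;
    }

    // Partial tile: guarded, block-striped single loads
    for (int idx = block_offset; idx < (int) d_in.size(); ++idx)
        thread_aggregate[(idx - block_offset) % BLOCK_THREADS] += d_in[idx];

    // Final block-wide reduction of the partials (BlockReduce's role)
    long long total = 0;
    for (int t = 0; t < BLOCK_THREADS; ++t)
        total += thread_aggregate[t];

    std::printf("sum = %lld (expected %d)\n", total, (21 * 22) / 2);
    return 0;
}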
+ */ + +#pragma once + +#include + +#include "../block/block_load.cuh" +#include "../block/block_reduce.cuh" +#include "../grid/grid_mapping.cuh" +#include "../grid/grid_even_share.cuh" +#include "../util_type.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../util_namespace.cuh" + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for AgentReduce + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + int _VECTOR_LOAD_LENGTH, ///< Number of items per vectorized load + BlockReduceAlgorithm _BLOCK_ALGORITHM, ///< Cooperative block-wide reduction algorithm to use + CacheLoadModifier _LOAD_MODIFIER> ///< Cache load modifier for reading input elements +struct AgentReducePolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + VECTOR_LOAD_LENGTH = _VECTOR_LOAD_LENGTH, ///< Number of items per vectorized load + }; + + static const BlockReduceAlgorithm BLOCK_ALGORITHM = _BLOCK_ALGORITHM; ///< Cooperative block-wide reduction algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements +}; + + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief AgentReduce implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduction . + * + * Each thread reduces only the values it loads. If \p FIRST_TILE, this + * partial reduction is stored into \p thread_aggregate. Otherwise it is + * accumulated into \p thread_aggregate. + */ +template < + typename AgentReducePolicy, ///< Parameterized AgentReducePolicy tuning policy type + typename InputIteratorT, ///< Random-access iterator type for input + typename OutputIteratorT, ///< Random-access iterator type for output + typename OffsetT, ///< Signed integer type for global offsets + typename ReductionOp> ///< Binary reduction operator type having member T operator()(const T &a, const T &b) +struct AgentReduce +{ + + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + /// The input value type + typedef typename std::iterator_traits::value_type InputT; + + /// The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... 
else the output iterator's value type + + /// Vector type of InputT for data movement + typedef typename CubVector::Type VectorT; + + /// Input iterator wrapper type (for applying cache modifier) + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedInputIterator + InputIteratorT>::Type // Directly use the supplied input iterator type + WrappedInputIteratorT; + + /// Constants + enum + { + BLOCK_THREADS = AgentReducePolicy::BLOCK_THREADS, + ITEMS_PER_THREAD = AgentReducePolicy::ITEMS_PER_THREAD, + VECTOR_LOAD_LENGTH = CUB_MIN(ITEMS_PER_THREAD, AgentReducePolicy::VECTOR_LOAD_LENGTH), + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + + // Can vectorize according to the policy if the input iterator is a native pointer to a primitive type + ATTEMPT_VECTORIZATION = (VECTOR_LOAD_LENGTH > 1) && + (ITEMS_PER_THREAD % VECTOR_LOAD_LENGTH == 0) && + (IsPointer::VALUE) && Traits::PRIMITIVE, + + }; + + static const CacheLoadModifier LOAD_MODIFIER = AgentReducePolicy::LOAD_MODIFIER; + static const BlockReduceAlgorithm BLOCK_ALGORITHM = AgentReducePolicy::BLOCK_ALGORITHM; + + /// Parameterized BlockReduce primitive + typedef BlockReduce BlockReduceT; + + /// Shared memory type required by this thread block + struct _TempStorage + { + typename BlockReduceT::TempStorage reduce; + }; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + _TempStorage& temp_storage; ///< Reference to temp_storage + InputIteratorT d_in; ///< Input data to reduce + WrappedInputIteratorT d_wrapped_in; ///< Wrapped input data to reduce + ReductionOp reduction_op; ///< Binary reduction operator + + + //--------------------------------------------------------------------- + // Utility + //--------------------------------------------------------------------- + + + // Whether or not the input is aligned with the vector type (specialized for types we can vectorize) + template + static __device__ __forceinline__ bool IsAligned( + Iterator d_in, + Int2Type /*can_vectorize*/) + { + return (size_t(d_in) & (sizeof(VectorT) - 1)) == 0; + } + + // Whether or not the input is aligned with the vector type (specialized for types we cannot vectorize) + template + static __device__ __forceinline__ bool IsAligned( + Iterator /*d_in*/, + Int2Type /*can_vectorize*/) + { + return false; + } + + + //--------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------- + + /** + * Constructor + */ + __device__ __forceinline__ AgentReduce( + TempStorage& temp_storage, ///< Reference to temp_storage + InputIteratorT d_in, ///< Input data to reduce + ReductionOp reduction_op) ///< Binary reduction operator + : + temp_storage(temp_storage.Alias()), + d_in(d_in), + d_wrapped_in(d_in), + reduction_op(reduction_op) + {} + + + //--------------------------------------------------------------------- + // Tile consumption + //--------------------------------------------------------------------- + + /** + * Consume a full tile of input (non-vectorized) + */ + template + __device__ __forceinline__ void ConsumeTile( + OutputT &thread_aggregate, + OffsetT block_offset, ///< The offset the tile to consume + int /*valid_items*/, ///< The number of valid items in the tile + Int2Type 
/*is_full_tile*/, ///< Whether or not this is a full tile + Int2Type /*can_vectorize*/) ///< Whether or not we can vectorize loads + { + OutputT items[ITEMS_PER_THREAD]; + + // Load items in striped fashion + LoadDirectStriped(threadIdx.x, d_wrapped_in + block_offset, items); + + // Reduce items within each thread stripe + thread_aggregate = (IS_FIRST_TILE) ? + internal::ThreadReduce(items, reduction_op) : + internal::ThreadReduce(items, reduction_op, thread_aggregate); + } + + + /** + * Consume a full tile of input (vectorized) + */ + template + __device__ __forceinline__ void ConsumeTile( + OutputT &thread_aggregate, + OffsetT block_offset, ///< The offset the tile to consume + int /*valid_items*/, ///< The number of valid items in the tile + Int2Type /*is_full_tile*/, ///< Whether or not this is a full tile + Int2Type /*can_vectorize*/) ///< Whether or not we can vectorize loads + { + // Alias items as an array of VectorT and load it in striped fashion + enum { WORDS = ITEMS_PER_THREAD / VECTOR_LOAD_LENGTH }; + + // Fabricate a vectorized input iterator + InputT *d_in_unqualified = const_cast(d_in) + block_offset + (threadIdx.x * VECTOR_LOAD_LENGTH); + CacheModifiedInputIterator d_vec_in( + reinterpret_cast(d_in_unqualified)); + + // Load items as vector items + InputT input_items[ITEMS_PER_THREAD]; + VectorT *vec_items = reinterpret_cast(input_items); + #pragma unroll + for (int i = 0; i < WORDS; ++i) + vec_items[i] = d_vec_in[BLOCK_THREADS * i]; + + // Convert from input type to output type + OutputT items[ITEMS_PER_THREAD]; + #pragma unroll + for (int i = 0; i < ITEMS_PER_THREAD; ++i) + items[i] = input_items[i]; + + // Reduce items within each thread stripe + thread_aggregate = (IS_FIRST_TILE) ? + internal::ThreadReduce(items, reduction_op) : + internal::ThreadReduce(items, reduction_op, thread_aggregate); + } + + + /** + * Consume a partial tile of input + */ + template + __device__ __forceinline__ void ConsumeTile( + OutputT &thread_aggregate, + OffsetT block_offset, ///< The offset the tile to consume + int valid_items, ///< The number of valid items in the tile + Int2Type /*is_full_tile*/, ///< Whether or not this is a full tile + Int2Type /*can_vectorize*/) ///< Whether or not we can vectorize loads + { + // Partial tile + int thread_offset = threadIdx.x; + + // Read first item + if ((IS_FIRST_TILE) && (thread_offset < valid_items)) + { + thread_aggregate = d_wrapped_in[block_offset + thread_offset]; + thread_offset += BLOCK_THREADS; + } + + // Continue reading items (block-striped) + while (thread_offset < valid_items) + { + OutputT item = d_wrapped_in[block_offset + thread_offset]; + thread_aggregate = reduction_op(thread_aggregate, item); + thread_offset += BLOCK_THREADS; + } + } + + + //--------------------------------------------------------------- + // Consume a contiguous segment of tiles + //--------------------------------------------------------------------- + + /** + * \brief Reduce a contiguous segment of input tiles + */ + template + __device__ __forceinline__ OutputT ConsumeRange( + GridEvenShare &even_share, ///< GridEvenShare descriptor + Int2Type can_vectorize) ///< Whether or not we can vectorize loads + { + OutputT thread_aggregate; + + if (even_share.block_offset + TILE_ITEMS > even_share.block_end) + { + // First tile isn't full (not all threads have valid items) + int valid_items = even_share.block_end - even_share.block_offset; + ConsumeTile(thread_aggregate, even_share.block_offset, valid_items, Int2Type(), can_vectorize); + return 
BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op, valid_items); + } + + // At least one full block + ConsumeTile(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type(), can_vectorize); + even_share.block_offset += even_share.block_stride; + + // Consume subsequent full tiles of input + while (even_share.block_offset + TILE_ITEMS <= even_share.block_end) + { + ConsumeTile(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type(), can_vectorize); + even_share.block_offset += even_share.block_stride; + } + + // Consume a partially-full tile + if (even_share.block_offset < even_share.block_end) + { + int valid_items = even_share.block_end - even_share.block_offset; + ConsumeTile(thread_aggregate, even_share.block_offset, valid_items, Int2Type(), can_vectorize); + } + + // Compute block-wide reduction (all threads have valid items) + return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op); + } + + + /** + * \brief Reduce a contiguous segment of input tiles + */ + __device__ __forceinline__ OutputT ConsumeRange( + OffsetT block_offset, ///< [in] Threadblock begin offset (inclusive) + OffsetT block_end) ///< [in] Threadblock end offset (exclusive) + { + GridEvenShare even_share; + even_share.template BlockInit(block_offset, block_end); + + return (IsAligned(d_in + block_offset, Int2Type())) ? + ConsumeRange(even_share, Int2Type()) : + ConsumeRange(even_share, Int2Type()); + } + + + /** + * Reduce a contiguous segment of input tiles + */ + __device__ __forceinline__ OutputT ConsumeTiles( + GridEvenShare &even_share) ///< [in] GridEvenShare descriptor + { + // Initialize GRID_MAPPING_STRIP_MINE even-share descriptor for this thread block + even_share.template BlockInit(); + + return (IsAligned(d_in, Int2Type())) ? + ConsumeRange(even_share, Int2Type()) : + ConsumeRange(even_share, Int2Type()); + + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/agent/agent_reduce_by_key.cuh b/GraphBLAS/CUDA/local_cub/agent/agent_reduce_by_key.cuh new file mode 100644 index 0000000000..51964d3e68 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/agent/agent_reduce_by_key.cuh @@ -0,0 +1,547 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::AgentReduceByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key. + */ + +#pragma once + +#include + +#include "single_pass_scan_operators.cuh" +#include "../block/block_load.cuh" +#include "../block/block_store.cuh" +#include "../block/block_scan.cuh" +#include "../block/block_discontinuity.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../iterator/constant_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for AgentReduceByKey + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements + BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use +struct AgentReduceByKeyPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + }; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements + static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use +}; + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief AgentReduceByKey implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key + */ +template < + typename AgentReduceByKeyPolicyT, ///< Parameterized AgentReduceByKeyPolicy tuning policy type + typename KeysInputIteratorT, ///< Random-access input iterator type for keys + typename UniqueOutputIteratorT, ///< Random-access output iterator type for keys + typename ValuesInputIteratorT, ///< Random-access input iterator type for values + typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values + typename NumRunsOutputIteratorT, ///< Output iterator type for recording number of items selected + typename EqualityOpT, ///< KeyT equality operator type + typename ReductionOpT, ///< ValueT reduction operator type + typename OffsetT> ///< Signed integer type for global offsets +struct AgentReduceByKey +{ + //--------------------------------------------------------------------- 
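Editor's note: before the implementation details, it may help to state the contract this agent parallelizes. Reduce-value-by-key treats each maximal run of consecutive equal keys as a segment: it writes the key once to d_unique_out, the reduction of that segment's values to d_aggregates_out, and the number of segments to d_num_runs_out. A sequential reference follows as an editor's sketch; the function name is illustrative only.

#include <cstdio>
#include <functional>
#include <vector>

template <typename Key, typename Value, typename ReductionOp>
int reduce_by_key_reference(const std::vector<Key>   &keys_in,
                            const std::vector<Value> &values_in,
                            std::vector<Key>         &unique_out,
                            std::vector<Value>       &aggregates_out,
                            ReductionOp               reduction_op)
{
    for (size_t i = 0; i < keys_in.size(); ++i)
    {
        if (i == 0 || !(keys_in[i] == keys_in[i - 1]))
        {
            // Segment head: start a new run
            unique_out.push_back(keys_in[i]);
            aggregates_out.push_back(values_in[i]);
        }
        else
        {
            // Same segment: fold the value into the running aggregate
            aggregates_out.back() = reduction_op(aggregates_out.back(), values_in[i]);
        }
    }
    return (int) unique_out.size();   // what d_num_runs_out would receive
}

int main()
{
    std::vector<int> keys   = {0, 0, 1, 1, 1, 3, 3, 2};
    std::vector<int> values = {1, 2, 3, 4, 5, 6, 7, 8};
    std::vector<int> unique, aggregates;

    int num_runs = reduce_by_key_reference(keys, values, unique, aggregates, std::plus<int>());
    for (int r = 0; r < num_runs; ++r)
        std::printf("key %d -> %d\n", unique[r], aggregates[r]);
    // Prints: key 0 -> 3, key 1 -> 12, key 3 -> 13, key 2 -> 8
    return 0;
}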
+ // Types and constants + //--------------------------------------------------------------------- + + // The input keys type + typedef typename std::iterator_traits::value_type KeyInputT; + + // The output keys type + typedef typename If<(Equals::value_type, void>::VALUE), // KeyOutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type KeyOutputT; // ... else the output iterator's value type + + // The input values type + typedef typename std::iterator_traits::value_type ValueInputT; + + // The output values type + typedef typename If<(Equals::value_type, void>::VALUE), // ValueOutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type ValueOutputT; // ... else the output iterator's value type + + // Tuple type for scanning (pairs accumulated segment-value with segment-index) + typedef KeyValuePair OffsetValuePairT; + + // Tuple type for pairing keys and values + typedef KeyValuePair KeyValuePairT; + + // Tile status descriptor interface type + typedef ReduceByKeyScanTileState ScanTileStateT; + + // Guarded inequality functor + template + struct GuardedInequalityWrapper + { + _EqualityOpT op; ///< Wrapped equality operator + int num_remaining; ///< Items remaining + + /// Constructor + __host__ __device__ __forceinline__ + GuardedInequalityWrapper(_EqualityOpT op, int num_remaining) : op(op), num_remaining(num_remaining) {} + + /// Boolean inequality operator, returns (a != b) + template + __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b, int idx) const + { + if (idx < num_remaining) + return !op(a, b); // In bounds + + // Return true if first out-of-bounds item, false otherwise + return (idx == num_remaining); + } + }; + + + // Constants + enum + { + BLOCK_THREADS = AgentReduceByKeyPolicyT::BLOCK_THREADS, + ITEMS_PER_THREAD = AgentReduceByKeyPolicyT::ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + TWO_PHASE_SCATTER = (ITEMS_PER_THREAD > 1), + + // Whether or not the scan operation has a zero-valued identity value (true if we're performing addition on a primitive type) + HAS_IDENTITY_ZERO = (Equals::VALUE) && (Traits::PRIMITIVE), + }; + + // Cache-modified Input iterator wrapper type (for applying cache modifier) for keys + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator + KeysInputIteratorT>::Type // Directly use the supplied input iterator type + WrappedKeysInputIteratorT; + + // Cache-modified Input iterator wrapper type (for applying cache modifier) for values + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator + ValuesInputIteratorT>::Type // Directly use the supplied input iterator type + WrappedValuesInputIteratorT; + + // Cache-modified Input iterator wrapper type (for applying cache modifier) for fixup values + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator + AggregatesOutputIteratorT>::Type // Directly use the supplied input iterator type + WrappedFixupInputIteratorT; + + // Reduce-value-by-segment scan operator + typedef ReduceBySegmentOp ReduceBySegmentOpT; + + // Parameterized BlockLoad type for keys + typedef BlockLoad< + 
KeyOutputT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + AgentReduceByKeyPolicyT::LOAD_ALGORITHM> + BlockLoadKeysT; + + // Parameterized BlockLoad type for values + typedef BlockLoad< + ValueOutputT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + AgentReduceByKeyPolicyT::LOAD_ALGORITHM> + BlockLoadValuesT; + + // Parameterized BlockDiscontinuity type for keys + typedef BlockDiscontinuity< + KeyOutputT, + BLOCK_THREADS> + BlockDiscontinuityKeys; + + // Parameterized BlockScan type + typedef BlockScan< + OffsetValuePairT, + BLOCK_THREADS, + AgentReduceByKeyPolicyT::SCAN_ALGORITHM> + BlockScanT; + + // Callback type for obtaining tile prefix during block scan + typedef TilePrefixCallbackOp< + OffsetValuePairT, + ReduceBySegmentOpT, + ScanTileStateT> + TilePrefixCallbackOpT; + + // Key and value exchange types + typedef KeyOutputT KeyExchangeT[TILE_ITEMS + 1]; + typedef ValueOutputT ValueExchangeT[TILE_ITEMS + 1]; + + // Shared memory type for this thread block + union _TempStorage + { + struct + { + typename BlockScanT::TempStorage scan; // Smem needed for tile scanning + typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback + typename BlockDiscontinuityKeys::TempStorage discontinuity; // Smem needed for discontinuity detection + }; + + // Smem needed for loading keys + typename BlockLoadKeysT::TempStorage load_keys; + + // Smem needed for loading values + typename BlockLoadValuesT::TempStorage load_values; + + // Smem needed for compacting key value pairs(allows non POD items in this union) + Uninitialized raw_exchange; + }; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + _TempStorage& temp_storage; ///< Reference to temp_storage + WrappedKeysInputIteratorT d_keys_in; ///< Input keys + UniqueOutputIteratorT d_unique_out; ///< Unique output keys + WrappedValuesInputIteratorT d_values_in; ///< Input values + AggregatesOutputIteratorT d_aggregates_out; ///< Output value aggregates + NumRunsOutputIteratorT d_num_runs_out; ///< Output pointer for total number of segments identified + EqualityOpT equality_op; ///< KeyT equality operator + ReductionOpT reduction_op; ///< Reduction operator + ReduceBySegmentOpT scan_op; ///< Reduce-by-segment scan operator + + + //--------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------- + + // Constructor + __device__ __forceinline__ + AgentReduceByKey( + TempStorage& temp_storage, ///< Reference to temp_storage + KeysInputIteratorT d_keys_in, ///< Input keys + UniqueOutputIteratorT d_unique_out, ///< Unique output keys + ValuesInputIteratorT d_values_in, ///< Input values + AggregatesOutputIteratorT d_aggregates_out, ///< Output value aggregates + NumRunsOutputIteratorT d_num_runs_out, ///< Output pointer for total number of segments identified + EqualityOpT equality_op, ///< KeyT equality operator + ReductionOpT reduction_op) ///< ValueT reduction operator + : + temp_storage(temp_storage.Alias()), + d_keys_in(d_keys_in), + d_unique_out(d_unique_out), + d_values_in(d_values_in), + d_aggregates_out(d_aggregates_out), + d_num_runs_out(d_num_runs_out), + equality_op(equality_op), + reduction_op(reduction_op), + scan_op(reduction_op) + {} + + + 
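Editor's note: the typedefs and fields above set up the core trick used in ConsumeTile below. Each value is zipped with its segment head flag into an (offset, value) pair, and the pairs are scanned with a reduce-by-segment operator that adds the flags (thereby counting segments) while restarting the value accumulation at every head. After an exclusive scan, a flagged position therefore carries the aggregate and the output index of the segment that just ended, which is exactly what gets scattered. Below is a small sequential model of that operator, specialized for addition; it is an editor's sketch and the names are illustrative.

#include <cstdio>

struct Pair { int key; int value; };                // (segment count, running aggregate)

// Models the reduce-by-segment scan operator with a plus reduction
static Pair reduce_by_segment(Pair a, Pair b)
{
    Pair r;
    r.key   = a.key + b.key;                        // count segment heads
    r.value = b.key ? b.value : a.value + b.value;  // restart the sum at a head
    return r;
}

int main()
{
    // Head flags and values for two segments: {1,2,3} and {4,5}.
    // As in ConsumeTile, the very first item is not flagged as a head.
    int flags[]  = {0, 0, 0, 1, 0};
    int values[] = {1, 2, 3, 4, 5};
    const int n  = 5;

    Pair running = {0, 0};                          // scan identity
    for (int i = 0; i < n; ++i)
    {
        Pair exclusive = running;                   // exclusive-scan result at i
        running = reduce_by_segment(running, Pair{flags[i], values[i]});

        // At a head, the exclusive result holds the aggregate and the
        // output index of the segment that just ended.
        if (flags[i])
            std::printf("segment %d aggregate = %d\n", exclusive.key, exclusive.value);
    }
    // The trailing segment is emitted by the last-tile logic in the agent:
    std::printf("segment %d aggregate = %d\n", running.key, running.value);
    return 0;
}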
//--------------------------------------------------------------------- + // Scatter utility methods + //--------------------------------------------------------------------- + + /** + * Directly scatter flagged items to output offsets + */ + __device__ __forceinline__ void ScatterDirect( + KeyValuePairT (&scatter_items)[ITEMS_PER_THREAD], + OffsetT (&segment_flags)[ITEMS_PER_THREAD], + OffsetT (&segment_indices)[ITEMS_PER_THREAD]) + { + // Scatter flagged keys and values + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + if (segment_flags[ITEM]) + { + d_unique_out[segment_indices[ITEM]] = scatter_items[ITEM].key; + d_aggregates_out[segment_indices[ITEM]] = scatter_items[ITEM].value; + } + } + } + + + /** + * 2-phase scatter flagged items to output offsets + * + * The exclusive scan causes each head flag to be paired with the previous + * value aggregate: the scatter offsets must be decremented for value aggregates + */ + __device__ __forceinline__ void ScatterTwoPhase( + KeyValuePairT (&scatter_items)[ITEMS_PER_THREAD], + OffsetT (&segment_flags)[ITEMS_PER_THREAD], + OffsetT (&segment_indices)[ITEMS_PER_THREAD], + OffsetT num_tile_segments, + OffsetT num_tile_segments_prefix) + { + CTA_SYNC(); + + // Compact and scatter pairs + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + if (segment_flags[ITEM]) + { + temp_storage.raw_exchange.Alias()[segment_indices[ITEM] - num_tile_segments_prefix] = scatter_items[ITEM]; + } + } + + CTA_SYNC(); + + for (int item = threadIdx.x; item < num_tile_segments; item += BLOCK_THREADS) + { + KeyValuePairT pair = temp_storage.raw_exchange.Alias()[item]; + d_unique_out[num_tile_segments_prefix + item] = pair.key; + d_aggregates_out[num_tile_segments_prefix + item] = pair.value; + } + } + + + /** + * Scatter flagged items + */ + __device__ __forceinline__ void Scatter( + KeyValuePairT (&scatter_items)[ITEMS_PER_THREAD], + OffsetT (&segment_flags)[ITEMS_PER_THREAD], + OffsetT (&segment_indices)[ITEMS_PER_THREAD], + OffsetT num_tile_segments, + OffsetT num_tile_segments_prefix) + { + // Do a one-phase scatter if (a) two-phase is disabled or (b) the average number of selected items per thread is less than one + if (TWO_PHASE_SCATTER && (num_tile_segments > BLOCK_THREADS)) + { + ScatterTwoPhase( + scatter_items, + segment_flags, + segment_indices, + num_tile_segments, + num_tile_segments_prefix); + } + else + { + ScatterDirect( + scatter_items, + segment_flags, + segment_indices); + } + } + + + //--------------------------------------------------------------------- + // Cooperatively scan a device-wide sequence of tiles with other CTAs + //--------------------------------------------------------------------- + + /** + * Process a tile of input (dynamic chained scan) + */ + template ///< Whether the current tile is the last tile + __device__ __forceinline__ void ConsumeTile( + OffsetT num_remaining, ///< Number of global input items remaining (including this tile) + int tile_idx, ///< Tile index + OffsetT tile_offset, ///< Tile offset + ScanTileStateT& tile_state) ///< Global tile state descriptor + { + KeyOutputT keys[ITEMS_PER_THREAD]; // Tile keys + KeyOutputT prev_keys[ITEMS_PER_THREAD]; // Tile keys shuffled up + ValueOutputT values[ITEMS_PER_THREAD]; // Tile values + OffsetT head_flags[ITEMS_PER_THREAD]; // Segment head flags + OffsetT segment_indices[ITEMS_PER_THREAD]; // Segment indices + OffsetValuePairT scan_items[ITEMS_PER_THREAD]; // Zipped values and segment flags|indices + KeyValuePairT 
scatter_items[ITEMS_PER_THREAD]; // Zipped key value pairs for scattering + + // Load keys + if (IS_LAST_TILE) + BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys, num_remaining); + else + BlockLoadKeysT(temp_storage.load_keys).Load(d_keys_in + tile_offset, keys); + + // Load tile predecessor key in first thread + KeyOutputT tile_predecessor; + if (threadIdx.x == 0) + { + tile_predecessor = (tile_idx == 0) ? + keys[0] : // First tile gets repeat of first item (thus first item will not be flagged as a head) + d_keys_in[tile_offset - 1]; // Subsequent tiles get last key from previous tile + } + + CTA_SYNC(); + + // Load values + if (IS_LAST_TILE) + BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + tile_offset, values, num_remaining); + else + BlockLoadValuesT(temp_storage.load_values).Load(d_values_in + tile_offset, values); + + CTA_SYNC(); + + // Initialize head-flags and shuffle up the previous keys + if (IS_LAST_TILE) + { + // Use custom flag operator to additionally flag the first out-of-bounds item + GuardedInequalityWrapper flag_op(equality_op, num_remaining); + BlockDiscontinuityKeys(temp_storage.discontinuity).FlagHeads( + head_flags, keys, prev_keys, flag_op, tile_predecessor); + } + else + { + InequalityWrapper flag_op(equality_op); + BlockDiscontinuityKeys(temp_storage.discontinuity).FlagHeads( + head_flags, keys, prev_keys, flag_op, tile_predecessor); + } + + // Zip values and head flags + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + scan_items[ITEM].value = values[ITEM]; + scan_items[ITEM].key = head_flags[ITEM]; + } + + // Perform exclusive tile scan + OffsetValuePairT block_aggregate; // Inclusive block-wide scan aggregate + OffsetT num_segments_prefix; // Number of segments prior to this tile + OffsetValuePairT total_aggregate; // The tile prefix folded with block_aggregate + if (tile_idx == 0) + { + // Scan first tile + BlockScanT(temp_storage.scan).ExclusiveScan(scan_items, scan_items, scan_op, block_aggregate); + num_segments_prefix = 0; + total_aggregate = block_aggregate; + + // Update tile status if there are successor tiles + if ((!IS_LAST_TILE) && (threadIdx.x == 0)) + tile_state.SetInclusive(0, block_aggregate); + } + else + { + // Scan non-first tile + TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, scan_op, tile_idx); + BlockScanT(temp_storage.scan).ExclusiveScan(scan_items, scan_items, scan_op, prefix_op); + + block_aggregate = prefix_op.GetBlockAggregate(); + num_segments_prefix = prefix_op.GetExclusivePrefix().key; + total_aggregate = prefix_op.GetInclusivePrefix(); + } + + // Rezip scatter items and segment indices + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + scatter_items[ITEM].key = prev_keys[ITEM]; + scatter_items[ITEM].value = scan_items[ITEM].value; + segment_indices[ITEM] = scan_items[ITEM].key; + } + + // At this point, each flagged segment head has: + // - The key for the previous segment + // - The reduced value from the previous segment + // - The segment index for the reduced value + + // Scatter flagged keys and values + OffsetT num_tile_segments = block_aggregate.key; + Scatter(scatter_items, head_flags, segment_indices, num_tile_segments, num_segments_prefix); + + // Last thread in last tile will output final count (and last pair, if necessary) + if ((IS_LAST_TILE) && (threadIdx.x == BLOCK_THREADS - 1)) + { + OffsetT num_segments = num_segments_prefix + num_tile_segments; + + // If the last tile is a whole tile, output the final_value 
+ if (num_remaining == TILE_ITEMS) + { + d_unique_out[num_segments] = keys[ITEMS_PER_THREAD - 1]; + d_aggregates_out[num_segments] = total_aggregate.value; + num_segments++; + } + + // Output the total number of items selected + *d_num_runs_out = num_segments; + } + } + + + /** + * Scan tiles of items as part of a dynamic chained scan + */ + __device__ __forceinline__ void ConsumeRange( + int num_items, ///< Total number of input items + ScanTileStateT& tile_state, ///< Global tile state descriptor + int start_tile) ///< The starting tile for the current grid + { + // Blocks are launched in increasing order, so just assign one tile per block + int tile_idx = start_tile + blockIdx.x; // Current tile index + OffsetT tile_offset = OffsetT(TILE_ITEMS) * tile_idx; // Global offset for the current tile + OffsetT num_remaining = num_items - tile_offset; // Remaining items (including this tile) + + if (num_remaining > TILE_ITEMS) + { + // Not last tile + ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state); + } + else if (num_remaining > 0) + { + // Last tile + ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state); + } + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/agent/agent_rle.cuh b/GraphBLAS/CUDA/local_cub/agent/agent_rle.cuh new file mode 100644 index 0000000000..cb7a4a652d --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/agent/agent_rle.cuh @@ -0,0 +1,837 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::AgentRle implements a stateful abstraction of CUDA thread blocks for participating in device-wide run-length-encode. 
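Editor's note, for orientation before the flag manipulation that follows: the head and tail flags computed in InitializeSelections mark run boundaries, and an item that is both a head and a tail is a trivial (length-1) run that produces no output, so the agent ultimately records the starting offset and length of every run of two or more consecutive equal items. A sequential reference for that output is sketched below; it is an editor's illustration and the helper name is made up.

#include <cstdio>
#include <vector>

// Emit (offset, length) for every non-trivial run (length >= 2) of equal,
// consecutive items; returns the number of such runs.
template <typename T>
int non_trivial_runs(const std::vector<T> &in,
                     std::vector<int> &offsets_out,
                     std::vector<int> &lengths_out)
{
    size_t i = 0;
    while (i < in.size())
    {
        size_t j = i + 1;
        while (j < in.size() && in[j] == in[i]) ++j;   // extend the run
        if (j - i >= 2)                                // skip trivial (length-1) runs
        {
            offsets_out.push_back((int) i);
            lengths_out.push_back((int) (j - i));
        }
        i = j;
    }
    return (int) offsets_out.size();
}

int main()
{
    std::vector<int> in = {7, 7, 7, 2, 5, 5, 9, 9, 9, 9, 3};
    std::vector<int> offsets, lengths;
    int num_runs = non_trivial_runs(in, offsets, lengths);
    for (int r = 0; r < num_runs; ++r)
        std::printf("run at %d, length %d\n", offsets[r], lengths[r]);
    // Prints: run at 0, length 3 / run at 4, length 2 / run at 6, length 4
    return 0;
}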
+ */ + +#pragma once + +#include + +#include "single_pass_scan_operators.cuh" +#include "../block/block_load.cuh" +#include "../block/block_store.cuh" +#include "../block/block_scan.cuh" +#include "../block/block_exchange.cuh" +#include "../block/block_discontinuity.cuh" +#include "../grid/grid_queue.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../iterator/constant_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for AgentRle + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements + bool _STORE_WARP_TIME_SLICING, ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage) + BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use +struct AgentRlePolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + STORE_WARP_TIME_SLICING = _STORE_WARP_TIME_SLICING, ///< Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage) + }; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements + static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use +}; + + + + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief AgentRle implements a stateful abstraction of CUDA thread blocks for participating in device-wide run-length-encode + */ +template < + typename AgentRlePolicyT, ///< Parameterized AgentRlePolicyT tuning policy type + typename InputIteratorT, ///< Random-access input iterator type for data + typename OffsetsOutputIteratorT, ///< Random-access output iterator type for offset values + typename LengthsOutputIteratorT, ///< Random-access output iterator type for length values + typename EqualityOpT, ///< T equality operator type + typename OffsetT> ///< Signed integer type for global offsets +struct AgentRle +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + /// The input value type + typedef typename std::iterator_traits::value_type T; + + /// The lengths output value type + typedef typename If<(Equals::value_type, void>::VALUE), // LengthT = (if output iterator's value type is void) ? + OffsetT, // ... then the OffsetT type, + typename std::iterator_traits::value_type>::Type LengthT; // ... 
else the output iterator's value type + + /// Tuple type for scanning (pairs run-length and run-index) + typedef KeyValuePair LengthOffsetPair; + + /// Tile status descriptor interface type + typedef ReduceByKeyScanTileState ScanTileStateT; + + // Constants + enum + { + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), + BLOCK_THREADS = AgentRlePolicyT::BLOCK_THREADS, + ITEMS_PER_THREAD = AgentRlePolicyT::ITEMS_PER_THREAD, + WARP_ITEMS = WARP_THREADS * ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + + /// Whether or not to sync after loading data + SYNC_AFTER_LOAD = (AgentRlePolicyT::LOAD_ALGORITHM != BLOCK_LOAD_DIRECT), + + /// Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any store-related data transpositions (versus each warp having its own storage) + STORE_WARP_TIME_SLICING = AgentRlePolicyT::STORE_WARP_TIME_SLICING, + ACTIVE_EXCHANGE_WARPS = (STORE_WARP_TIME_SLICING) ? 1 : WARPS, + }; + + + /** + * Special operator that signals all out-of-bounds items are not equal to everything else, + * forcing both (1) the last item to be tail-flagged and (2) all oob items to be marked + * trivial. + */ + template + struct OobInequalityOp + { + OffsetT num_remaining; + EqualityOpT equality_op; + + __device__ __forceinline__ OobInequalityOp( + OffsetT num_remaining, + EqualityOpT equality_op) + : + num_remaining(num_remaining), + equality_op(equality_op) + {} + + template + __host__ __device__ __forceinline__ bool operator()(T first, T second, Index idx) + { + if (!LAST_TILE || (idx < num_remaining)) + return !equality_op(first, second); + else + return true; + } + }; + + + // Cache-modified Input iterator wrapper type (for applying cache modifier) for data + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedVLengthnputIterator + InputIteratorT>::Type // Directly use the supplied input iterator type + WrappedInputIteratorT; + + // Parameterized BlockLoad type for data + typedef BlockLoad< + T, + AgentRlePolicyT::BLOCK_THREADS, + AgentRlePolicyT::ITEMS_PER_THREAD, + AgentRlePolicyT::LOAD_ALGORITHM> + BlockLoadT; + + // Parameterized BlockDiscontinuity type for data + typedef BlockDiscontinuity BlockDiscontinuityT; + + // Parameterized WarpScan type + typedef WarpScan WarpScanPairs; + + // Reduce-length-by-run scan operator + typedef ReduceBySegmentOp ReduceBySegmentOpT; + + // Callback type for obtaining tile prefix during block scan + typedef TilePrefixCallbackOp< + LengthOffsetPair, + ReduceBySegmentOpT, + ScanTileStateT> + TilePrefixCallbackOpT; + + // Warp exchange types + typedef WarpExchange WarpExchangePairs; + + typedef typename If::Type WarpExchangePairsStorage; + + typedef WarpExchange WarpExchangeOffsets; + typedef WarpExchange WarpExchangeLengths; + + typedef LengthOffsetPair WarpAggregates[WARPS]; + + // Shared memory type for this thread block + struct _TempStorage + { + // Aliasable storage layout + union Aliasable + { + struct + { + typename BlockDiscontinuityT::TempStorage discontinuity; // Smem needed for discontinuity detection + typename WarpScanPairs::TempStorage warp_scan[WARPS]; // Smem needed for warp-synchronous scans + Uninitialized warp_aggregates; // Smem needed for sharing warp-wide aggregates + typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback + }; + + // Smem needed for input loading + typename BlockLoadT::TempStorage 
load; + + // Aliasable layout needed for two-phase scatter + union ScatterAliasable + { + unsigned long long align; + WarpExchangePairsStorage exchange_pairs[ACTIVE_EXCHANGE_WARPS]; + typename WarpExchangeOffsets::TempStorage exchange_offsets[ACTIVE_EXCHANGE_WARPS]; + typename WarpExchangeLengths::TempStorage exchange_lengths[ACTIVE_EXCHANGE_WARPS]; + + } scatter_aliasable; + + } aliasable; + + OffsetT tile_idx; // Shared tile index + LengthOffsetPair tile_inclusive; // Inclusive tile prefix + LengthOffsetPair tile_exclusive; // Exclusive tile prefix + }; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + _TempStorage& temp_storage; ///< Reference to temp_storage + + WrappedInputIteratorT d_in; ///< Pointer to input sequence of data items + OffsetsOutputIteratorT d_offsets_out; ///< Input run offsets + LengthsOutputIteratorT d_lengths_out; ///< Output run lengths + + EqualityOpT equality_op; ///< T equality operator + ReduceBySegmentOpT scan_op; ///< Reduce-length-by-flag scan operator + OffsetT num_items; ///< Total number of input items + + + //--------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------- + + // Constructor + __device__ __forceinline__ + AgentRle( + TempStorage &temp_storage, ///< [in] Reference to temp_storage + InputIteratorT d_in, ///< [in] Pointer to input sequence of data items + OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to output sequence of run offsets + LengthsOutputIteratorT d_lengths_out, ///< [out] Pointer to output sequence of run lengths + EqualityOpT equality_op, ///< [in] T equality operator + OffsetT num_items) ///< [in] Total number of input items + : + temp_storage(temp_storage.Alias()), + d_in(d_in), + d_offsets_out(d_offsets_out), + d_lengths_out(d_lengths_out), + equality_op(equality_op), + scan_op(cub::Sum()), + num_items(num_items) + {} + + + //--------------------------------------------------------------------- + // Utility methods for initializing the selections + //--------------------------------------------------------------------- + + template + __device__ __forceinline__ void InitializeSelections( + OffsetT tile_offset, + OffsetT num_remaining, + T (&items)[ITEMS_PER_THREAD], + LengthOffsetPair (&lengths_and_num_runs)[ITEMS_PER_THREAD]) + { + bool head_flags[ITEMS_PER_THREAD]; + bool tail_flags[ITEMS_PER_THREAD]; + + OobInequalityOp inequality_op(num_remaining, equality_op); + + if (FIRST_TILE && LAST_TILE) + { + // First-and-last-tile always head-flags the first item and tail-flags the last item + + BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails( + head_flags, tail_flags, items, inequality_op); + } + else if (FIRST_TILE) + { + // First-tile always head-flags the first item + + // Get the first item from the next tile + T tile_successor_item; + if (threadIdx.x == BLOCK_THREADS - 1) + tile_successor_item = d_in[tile_offset + TILE_ITEMS]; + + BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails( + head_flags, tail_flags, tile_successor_item, items, inequality_op); + } + else if (LAST_TILE) + { + // Last-tile always flags the last item + + // Get the last item from the previous tile + T tile_predecessor_item; + if (threadIdx.x == 0) + 
tile_predecessor_item = d_in[tile_offset - 1]; + + BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails( + head_flags, tile_predecessor_item, tail_flags, items, inequality_op); + } + else + { + // Get the first item from the next tile + T tile_successor_item; + if (threadIdx.x == BLOCK_THREADS - 1) + tile_successor_item = d_in[tile_offset + TILE_ITEMS]; + + // Get the last item from the previous tile + T tile_predecessor_item; + if (threadIdx.x == 0) + tile_predecessor_item = d_in[tile_offset - 1]; + + BlockDiscontinuityT(temp_storage.aliasable.discontinuity).FlagHeadsAndTails( + head_flags, tile_predecessor_item, tail_flags, tile_successor_item, items, inequality_op); + } + + // Zip counts and runs + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + lengths_and_num_runs[ITEM].key = head_flags[ITEM] && (!tail_flags[ITEM]); + lengths_and_num_runs[ITEM].value = ((!head_flags[ITEM]) || (!tail_flags[ITEM])); + } + } + + //--------------------------------------------------------------------- + // Scan utility methods + //--------------------------------------------------------------------- + + /** + * Scan of allocations + */ + __device__ __forceinline__ void WarpScanAllocations( + LengthOffsetPair &tile_aggregate, + LengthOffsetPair &warp_aggregate, + LengthOffsetPair &warp_exclusive_in_tile, + LengthOffsetPair &thread_exclusive_in_warp, + LengthOffsetPair (&lengths_and_num_runs)[ITEMS_PER_THREAD]) + { + // Perform warpscans + unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS); + int lane_id = LaneId(); + + LengthOffsetPair identity; + identity.key = 0; + identity.value = 0; + + LengthOffsetPair thread_inclusive; + LengthOffsetPair thread_aggregate = internal::ThreadReduce(lengths_and_num_runs, scan_op); + WarpScanPairs(temp_storage.aliasable.warp_scan[warp_id]).Scan( + thread_aggregate, + thread_inclusive, + thread_exclusive_in_warp, + identity, + scan_op); + + // Last lane in each warp shares its warp-aggregate + if (lane_id == WARP_THREADS - 1) + temp_storage.aliasable.warp_aggregates.Alias()[warp_id] = thread_inclusive; + + CTA_SYNC(); + + // Accumulate total selected and the warp-wide prefix + warp_exclusive_in_tile = identity; + warp_aggregate = temp_storage.aliasable.warp_aggregates.Alias()[warp_id]; + tile_aggregate = temp_storage.aliasable.warp_aggregates.Alias()[0]; + + #pragma unroll + for (int WARP = 1; WARP < WARPS; ++WARP) + { + if (warp_id == WARP) + warp_exclusive_in_tile = tile_aggregate; + + tile_aggregate = scan_op(tile_aggregate, temp_storage.aliasable.warp_aggregates.Alias()[WARP]); + } + } + + + //--------------------------------------------------------------------- + // Utility methods for scattering selections + //--------------------------------------------------------------------- + + /** + * Two-phase scatter, specialized for warp time-slicing + */ + template + __device__ __forceinline__ void ScatterTwoPhase( + OffsetT tile_num_runs_exclusive_in_global, + OffsetT warp_num_runs_aggregate, + OffsetT warp_num_runs_exclusive_in_tile, + OffsetT (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD], + LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD], + Int2Type is_warp_time_slice) + { + unsigned int warp_id = ((WARPS == 1) ? 
0 : threadIdx.x / WARP_THREADS); + int lane_id = LaneId(); + + // Locally compact items within the warp (first warp) + if (warp_id == 0) + { + WarpExchangePairs(temp_storage.aliasable.scatter_aliasable.exchange_pairs[0]).ScatterToStriped( + lengths_and_offsets, thread_num_runs_exclusive_in_warp); + } + + // Locally compact items within the warp (remaining warps) + #pragma unroll + for (int SLICE = 1; SLICE < WARPS; ++SLICE) + { + CTA_SYNC(); + + if (warp_id == SLICE) + { + WarpExchangePairs(temp_storage.aliasable.scatter_aliasable.exchange_pairs[0]).ScatterToStriped( + lengths_and_offsets, thread_num_runs_exclusive_in_warp); + } + } + + // Global scatter + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if ((ITEM * WARP_THREADS) < warp_num_runs_aggregate - lane_id) + { + OffsetT item_offset = + tile_num_runs_exclusive_in_global + + warp_num_runs_exclusive_in_tile + + (ITEM * WARP_THREADS) + lane_id; + + // Scatter offset + d_offsets_out[item_offset] = lengths_and_offsets[ITEM].key; + + // Scatter length if not the first (global) length + if ((!FIRST_TILE) || (ITEM != 0) || (threadIdx.x > 0)) + { + d_lengths_out[item_offset - 1] = lengths_and_offsets[ITEM].value; + } + } + } + } + + + /** + * Two-phase scatter + */ + template + __device__ __forceinline__ void ScatterTwoPhase( + OffsetT tile_num_runs_exclusive_in_global, + OffsetT warp_num_runs_aggregate, + OffsetT warp_num_runs_exclusive_in_tile, + OffsetT (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD], + LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD], + Int2Type is_warp_time_slice) + { + unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS); + int lane_id = LaneId(); + + // Unzip + OffsetT run_offsets[ITEMS_PER_THREAD]; + LengthT run_lengths[ITEMS_PER_THREAD]; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + run_offsets[ITEM] = lengths_and_offsets[ITEM].key; + run_lengths[ITEM] = lengths_and_offsets[ITEM].value; + } + + WarpExchangeOffsets(temp_storage.aliasable.scatter_aliasable.exchange_offsets[warp_id]).ScatterToStriped( + run_offsets, thread_num_runs_exclusive_in_warp); + + WARP_SYNC(0xffffffff); + + WarpExchangeLengths(temp_storage.aliasable.scatter_aliasable.exchange_lengths[warp_id]).ScatterToStriped( + run_lengths, thread_num_runs_exclusive_in_warp); + + // Global scatter + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if ((ITEM * WARP_THREADS) + lane_id < warp_num_runs_aggregate) + { + OffsetT item_offset = + tile_num_runs_exclusive_in_global + + warp_num_runs_exclusive_in_tile + + (ITEM * WARP_THREADS) + lane_id; + + // Scatter offset + d_offsets_out[item_offset] = run_offsets[ITEM]; + + // Scatter length if not the first (global) length + if ((!FIRST_TILE) || (ITEM != 0) || (threadIdx.x > 0)) + { + d_lengths_out[item_offset - 1] = run_lengths[ITEM]; + } + } + } + } + + + /** + * Direct scatter + */ + template + __device__ __forceinline__ void ScatterDirect( + OffsetT tile_num_runs_exclusive_in_global, + OffsetT warp_num_runs_aggregate, + OffsetT warp_num_runs_exclusive_in_tile, + OffsetT (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD], + LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD]) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + if (thread_num_runs_exclusive_in_warp[ITEM] < warp_num_runs_aggregate) + { + OffsetT item_offset = + tile_num_runs_exclusive_in_global + + warp_num_runs_exclusive_in_tile + + thread_num_runs_exclusive_in_warp[ITEM]; + + // Scatter 
offset + d_offsets_out[item_offset] = lengths_and_offsets[ITEM].key; + + // Scatter length if not the first (global) length + if (item_offset >= 1) + { + d_lengths_out[item_offset - 1] = lengths_and_offsets[ITEM].value; + } + } + } + } + + + /** + * Scatter + */ + template + __device__ __forceinline__ void Scatter( + OffsetT tile_num_runs_aggregate, + OffsetT tile_num_runs_exclusive_in_global, + OffsetT warp_num_runs_aggregate, + OffsetT warp_num_runs_exclusive_in_tile, + OffsetT (&thread_num_runs_exclusive_in_warp)[ITEMS_PER_THREAD], + LengthOffsetPair (&lengths_and_offsets)[ITEMS_PER_THREAD]) + { + if ((ITEMS_PER_THREAD == 1) || (tile_num_runs_aggregate < BLOCK_THREADS)) + { + // Direct scatter if the warp has any items + if (warp_num_runs_aggregate) + { + ScatterDirect( + tile_num_runs_exclusive_in_global, + warp_num_runs_aggregate, + warp_num_runs_exclusive_in_tile, + thread_num_runs_exclusive_in_warp, + lengths_and_offsets); + } + } + else + { + // Scatter two phase + ScatterTwoPhase( + tile_num_runs_exclusive_in_global, + warp_num_runs_aggregate, + warp_num_runs_exclusive_in_tile, + thread_num_runs_exclusive_in_warp, + lengths_and_offsets, + Int2Type()); + } + } + + + + //--------------------------------------------------------------------- + // Cooperatively scan a device-wide sequence of tiles with other CTAs + //--------------------------------------------------------------------- + + /** + * Process a tile of input (dynamic chained scan) + */ + template < + bool LAST_TILE> + __device__ __forceinline__ LengthOffsetPair ConsumeTile( + OffsetT num_items, ///< Total number of global input items + OffsetT num_remaining, ///< Number of global input items remaining (including this tile) + int tile_idx, ///< Tile index + OffsetT tile_offset, ///< Tile offset + ScanTileStateT &tile_status) ///< Global list of tile status + { + if (tile_idx == 0) + { + // First tile + + // Load items + T items[ITEMS_PER_THREAD]; + if (LAST_TILE) + BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items, num_remaining, T()); + else + BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items); + + if (SYNC_AFTER_LOAD) + CTA_SYNC(); + + // Set flags + LengthOffsetPair lengths_and_num_runs[ITEMS_PER_THREAD]; + + InitializeSelections( + tile_offset, + num_remaining, + items, + lengths_and_num_runs); + + // Exclusive scan of lengths and runs + LengthOffsetPair tile_aggregate; + LengthOffsetPair warp_aggregate; + LengthOffsetPair warp_exclusive_in_tile; + LengthOffsetPair thread_exclusive_in_warp; + + WarpScanAllocations( + tile_aggregate, + warp_aggregate, + warp_exclusive_in_tile, + thread_exclusive_in_warp, + lengths_and_num_runs); + + // Update tile status if this is not the last tile + if (!LAST_TILE && (threadIdx.x == 0)) + tile_status.SetInclusive(0, tile_aggregate); + + // Update thread_exclusive_in_warp to fold in warp run-length + if (thread_exclusive_in_warp.key == 0) + thread_exclusive_in_warp.value += warp_exclusive_in_tile.value; + + LengthOffsetPair lengths_and_offsets[ITEMS_PER_THREAD]; + OffsetT thread_num_runs_exclusive_in_warp[ITEMS_PER_THREAD]; + LengthOffsetPair lengths_and_num_runs2[ITEMS_PER_THREAD]; + + // Downsweep scan through lengths_and_num_runs + internal::ThreadScanExclusive(lengths_and_num_runs, lengths_and_num_runs2, scan_op, thread_exclusive_in_warp); + + // Zip + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + lengths_and_offsets[ITEM].value = lengths_and_num_runs2[ITEM].value; + lengths_and_offsets[ITEM].key = tile_offset 
+ (threadIdx.x * ITEMS_PER_THREAD) + ITEM; + thread_num_runs_exclusive_in_warp[ITEM] = (lengths_and_num_runs[ITEM].key) ? + lengths_and_num_runs2[ITEM].key : // keep + WARP_THREADS * ITEMS_PER_THREAD; // discard + } + + OffsetT tile_num_runs_aggregate = tile_aggregate.key; + OffsetT tile_num_runs_exclusive_in_global = 0; + OffsetT warp_num_runs_aggregate = warp_aggregate.key; + OffsetT warp_num_runs_exclusive_in_tile = warp_exclusive_in_tile.key; + + // Scatter + Scatter( + tile_num_runs_aggregate, + tile_num_runs_exclusive_in_global, + warp_num_runs_aggregate, + warp_num_runs_exclusive_in_tile, + thread_num_runs_exclusive_in_warp, + lengths_and_offsets); + + // Return running total (inclusive of this tile) + return tile_aggregate; + } + else + { + // Not first tile + + // Load items + T items[ITEMS_PER_THREAD]; + if (LAST_TILE) + BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items, num_remaining, T()); + else + BlockLoadT(temp_storage.aliasable.load).Load(d_in + tile_offset, items); + + if (SYNC_AFTER_LOAD) + CTA_SYNC(); + + // Set flags + LengthOffsetPair lengths_and_num_runs[ITEMS_PER_THREAD]; + + InitializeSelections( + tile_offset, + num_remaining, + items, + lengths_and_num_runs); + + // Exclusive scan of lengths and runs + LengthOffsetPair tile_aggregate; + LengthOffsetPair warp_aggregate; + LengthOffsetPair warp_exclusive_in_tile; + LengthOffsetPair thread_exclusive_in_warp; + + WarpScanAllocations( + tile_aggregate, + warp_aggregate, + warp_exclusive_in_tile, + thread_exclusive_in_warp, + lengths_and_num_runs); + + // First warp computes tile prefix in lane 0 + TilePrefixCallbackOpT prefix_op(tile_status, temp_storage.aliasable.prefix, Sum(), tile_idx); + unsigned int warp_id = ((WARPS == 1) ? 0 : threadIdx.x / WARP_THREADS); + if (warp_id == 0) + { + prefix_op(tile_aggregate); + if (threadIdx.x == 0) + temp_storage.tile_exclusive = prefix_op.exclusive_prefix; + } + + CTA_SYNC(); + + LengthOffsetPair tile_exclusive_in_global = temp_storage.tile_exclusive; + + // Update thread_exclusive_in_warp to fold in warp and tile run-lengths + LengthOffsetPair thread_exclusive = scan_op(tile_exclusive_in_global, warp_exclusive_in_tile); + if (thread_exclusive_in_warp.key == 0) + thread_exclusive_in_warp.value += thread_exclusive.value; + + // Downsweep scan through lengths_and_num_runs + LengthOffsetPair lengths_and_num_runs2[ITEMS_PER_THREAD]; + LengthOffsetPair lengths_and_offsets[ITEMS_PER_THREAD]; + OffsetT thread_num_runs_exclusive_in_warp[ITEMS_PER_THREAD]; + + internal::ThreadScanExclusive(lengths_and_num_runs, lengths_and_num_runs2, scan_op, thread_exclusive_in_warp); + + // Zip + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + lengths_and_offsets[ITEM].value = lengths_and_num_runs2[ITEM].value; + lengths_and_offsets[ITEM].key = tile_offset + (threadIdx.x * ITEMS_PER_THREAD) + ITEM; + thread_num_runs_exclusive_in_warp[ITEM] = (lengths_and_num_runs[ITEM].key) ? 
+ lengths_and_num_runs2[ITEM].key : // keep + WARP_THREADS * ITEMS_PER_THREAD; // discard + } + + OffsetT tile_num_runs_aggregate = tile_aggregate.key; + OffsetT tile_num_runs_exclusive_in_global = tile_exclusive_in_global.key; + OffsetT warp_num_runs_aggregate = warp_aggregate.key; + OffsetT warp_num_runs_exclusive_in_tile = warp_exclusive_in_tile.key; + + // Scatter + Scatter( + tile_num_runs_aggregate, + tile_num_runs_exclusive_in_global, + warp_num_runs_aggregate, + warp_num_runs_exclusive_in_tile, + thread_num_runs_exclusive_in_warp, + lengths_and_offsets); + + // Return running total (inclusive of this tile) + return prefix_op.inclusive_prefix; + } + } + + + /** + * Scan tiles of items as part of a dynamic chained scan + */ + template ///< Output iterator type for recording number of items selected + __device__ __forceinline__ void ConsumeRange( + int num_tiles, ///< Total number of input tiles + ScanTileStateT& tile_status, ///< Global list of tile status + NumRunsIteratorT d_num_runs_out) ///< Output pointer for total number of runs identified + { + // Blocks are launched in increasing order, so just assign one tile per block + int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index + OffsetT tile_offset = tile_idx * TILE_ITEMS; // Global offset for the current tile + OffsetT num_remaining = num_items - tile_offset; // Remaining items (including this tile) + + if (tile_idx < num_tiles - 1) + { + // Not the last tile (full) + ConsumeTile(num_items, num_remaining, tile_idx, tile_offset, tile_status); + } + else if (num_remaining > 0) + { + // The last tile (possibly partially-full) + LengthOffsetPair running_total = ConsumeTile(num_items, num_remaining, tile_idx, tile_offset, tile_status); + + if (threadIdx.x == 0) + { + // Output the total number of items selected + *d_num_runs_out = running_total.key; + + // The inclusive prefix contains accumulated length reduction for the last run + if (running_total.key > 0) + d_lengths_out[running_total.key - 1] = running_total.value; + } + } + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/agent/agent_scan.cuh b/GraphBLAS/CUDA/local_cub/agent/agent_scan.cuh new file mode 100644 index 0000000000..9368615ef4 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/agent/agent_scan.cuh @@ -0,0 +1,471 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::AgentScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan . + */ + +#pragma once + +#include + +#include "single_pass_scan_operators.cuh" +#include "../block/block_load.cuh" +#include "../block/block_store.cuh" +#include "../block/block_scan.cuh" +#include "../grid/grid_queue.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for AgentScan + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements + BlockStoreAlgorithm _STORE_ALGORITHM, ///< The BlockStore algorithm to use + BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use +struct AgentScanPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + }; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements + static const BlockStoreAlgorithm STORE_ALGORITHM = _STORE_ALGORITHM; ///< The BlockStore algorithm to use + static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use +}; + + + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief AgentScan implements a stateful abstraction of CUDA thread blocks for participating in device-wide prefix scan . 
+ */ +template < + typename AgentScanPolicyT, ///< Parameterized AgentScanPolicyT tuning policy type + typename InputIteratorT, ///< Random-access input iterator type + typename OutputIteratorT, ///< Random-access output iterator type + typename ScanOpT, ///< Scan functor type + typename InitValueT, ///< The init_value element for ScanOpT type (cub::NullType for inclusive scan) + typename OffsetT> ///< Signed integer type for global offsets +struct AgentScan +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + // The input value type + typedef typename std::iterator_traits::value_type InputT; + + // The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type + + // Tile status descriptor interface type + typedef ScanTileState ScanTileStateT; + + // Input iterator wrapper type (for applying cache modifier) + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedInputIterator + InputIteratorT>::Type // Directly use the supplied input iterator type + WrappedInputIteratorT; + + // Constants + enum + { + IS_INCLUSIVE = Equals::VALUE, // Inclusive scan if no init_value type is provided + BLOCK_THREADS = AgentScanPolicyT::BLOCK_THREADS, + ITEMS_PER_THREAD = AgentScanPolicyT::ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + }; + + // Parameterized BlockLoad type + typedef BlockLoad< + OutputT, + AgentScanPolicyT::BLOCK_THREADS, + AgentScanPolicyT::ITEMS_PER_THREAD, + AgentScanPolicyT::LOAD_ALGORITHM> + BlockLoadT; + + // Parameterized BlockStore type + typedef BlockStore< + OutputT, + AgentScanPolicyT::BLOCK_THREADS, + AgentScanPolicyT::ITEMS_PER_THREAD, + AgentScanPolicyT::STORE_ALGORITHM> + BlockStoreT; + + // Parameterized BlockScan type + typedef BlockScan< + OutputT, + AgentScanPolicyT::BLOCK_THREADS, + AgentScanPolicyT::SCAN_ALGORITHM> + BlockScanT; + + // Callback type for obtaining tile prefix during block scan + typedef TilePrefixCallbackOp< + OutputT, + ScanOpT, + ScanTileStateT> + TilePrefixCallbackOpT; + + // Stateful BlockScan prefix callback type for managing a running total while scanning consecutive tiles + typedef BlockScanRunningPrefixOp< + OutputT, + ScanOpT> + RunningPrefixCallbackOp; + + // Shared memory type for this thread block + union _TempStorage + { + typename BlockLoadT::TempStorage load; // Smem needed for tile loading + typename BlockStoreT::TempStorage store; // Smem needed for tile storing + + struct + { + typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback + typename BlockScanT::TempStorage scan; // Smem needed for tile scanning + }; + }; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + _TempStorage& temp_storage; ///< Reference to temp_storage + WrappedInputIteratorT d_in; ///< Input data + OutputIteratorT d_out; ///< Output data + ScanOpT scan_op; ///< Binary scan operator + InitValueT init_value; ///< The init_value 
element for ScanOpT + + + //--------------------------------------------------------------------- + // Block scan utility methods + //--------------------------------------------------------------------- + + /** + * Exclusive scan specialization (first tile) + */ + __device__ __forceinline__ + void ScanTile( + OutputT (&items)[ITEMS_PER_THREAD], + OutputT init_value, + ScanOpT scan_op, + OutputT &block_aggregate, + Int2Type /*is_inclusive*/) + { + BlockScanT(temp_storage.scan).ExclusiveScan(items, items, init_value, scan_op, block_aggregate); + block_aggregate = scan_op(init_value, block_aggregate); + } + + + /** + * Inclusive scan specialization (first tile) + */ + __device__ __forceinline__ + void ScanTile( + OutputT (&items)[ITEMS_PER_THREAD], + InitValueT /*init_value*/, + ScanOpT scan_op, + OutputT &block_aggregate, + Int2Type /*is_inclusive*/) + { + BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, block_aggregate); + } + + + /** + * Exclusive scan specialization (subsequent tiles) + */ + template + __device__ __forceinline__ + void ScanTile( + OutputT (&items)[ITEMS_PER_THREAD], + ScanOpT scan_op, + PrefixCallback &prefix_op, + Int2Type /*is_inclusive*/) + { + BlockScanT(temp_storage.scan).ExclusiveScan(items, items, scan_op, prefix_op); + } + + + /** + * Inclusive scan specialization (subsequent tiles) + */ + template + __device__ __forceinline__ + void ScanTile( + OutputT (&items)[ITEMS_PER_THREAD], + ScanOpT scan_op, + PrefixCallback &prefix_op, + Int2Type /*is_inclusive*/) + { + BlockScanT(temp_storage.scan).InclusiveScan(items, items, scan_op, prefix_op); + } + + + //--------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------- + + // Constructor + __device__ __forceinline__ + AgentScan( + TempStorage& temp_storage, ///< Reference to temp_storage + InputIteratorT d_in, ///< Input data + OutputIteratorT d_out, ///< Output data + ScanOpT scan_op, ///< Binary scan operator + InitValueT init_value) ///< Initial value to seed the exclusive scan + : + temp_storage(temp_storage.Alias()), + d_in(d_in), + d_out(d_out), + scan_op(scan_op), + init_value(init_value) + {} + + + //--------------------------------------------------------------------- + // Cooperatively scan a device-wide sequence of tiles with other CTAs + //--------------------------------------------------------------------- + + /** + * Process a tile of input (dynamic chained scan) + */ + template ///< Whether the current tile is the last tile + __device__ __forceinline__ void ConsumeTile( + OffsetT num_remaining, ///< Number of global input items remaining (including this tile) + int tile_idx, ///< Tile index + OffsetT tile_offset, ///< Tile offset + ScanTileStateT& tile_state) ///< Global tile state descriptor + { + // Load items + OutputT items[ITEMS_PER_THREAD]; + + if (IS_LAST_TILE) + BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items, num_remaining); + else + BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items); + + CTA_SYNC(); + + // Perform tile scan + if (tile_idx == 0) + { + // Scan first tile + OutputT block_aggregate; + ScanTile(items, init_value, scan_op, block_aggregate, Int2Type()); + if ((!IS_LAST_TILE) && (threadIdx.x == 0)) + tile_state.SetInclusive(0, block_aggregate); + } + else + { + // Scan non-first tile + TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, scan_op, tile_idx); + ScanTile(items, scan_op, prefix_op, Int2Type()); + } + + CTA_SYNC(); + + // 
Store items + if (IS_LAST_TILE) + BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items, num_remaining); + else + BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items); + } + + + /** + * Scan tiles of items as part of a dynamic chained scan + */ + __device__ __forceinline__ void ConsumeRange( + int num_items, ///< Total number of input items + ScanTileStateT& tile_state, ///< Global tile state descriptor + int start_tile) ///< The starting tile for the current grid + { + // Blocks are launched in increasing order, so just assign one tile per block + int tile_idx = start_tile + blockIdx.x; // Current tile index + OffsetT tile_offset = OffsetT(TILE_ITEMS) * tile_idx; // Global offset for the current tile + OffsetT num_remaining = num_items - tile_offset; // Remaining items (including this tile) + + if (num_remaining > TILE_ITEMS) + { + // Not last tile + ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state); + } + else if (num_remaining > 0) + { + // Last tile + ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state); + } + } + + + //--------------------------------------------------------------------- + // Scan an sequence of consecutive tiles (independent of other thread blocks) + //--------------------------------------------------------------------- + + /** + * Process a tile of input + */ + template < + bool IS_FIRST_TILE, + bool IS_LAST_TILE> + __device__ __forceinline__ void ConsumeTile( + OffsetT tile_offset, ///< Tile offset + RunningPrefixCallbackOp& prefix_op, ///< Running prefix operator + int valid_items = TILE_ITEMS) ///< Number of valid items in the tile + { + // Load items + OutputT items[ITEMS_PER_THREAD]; + + if (IS_LAST_TILE) + BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items, valid_items); + else + BlockLoadT(temp_storage.load).Load(d_in + tile_offset, items); + + CTA_SYNC(); + + // Block scan + if (IS_FIRST_TILE) + { + OutputT block_aggregate; + ScanTile(items, init_value, scan_op, block_aggregate, Int2Type()); + prefix_op.running_total = block_aggregate; + } + else + { + ScanTile(items, scan_op, prefix_op, Int2Type()); + } + + CTA_SYNC(); + + // Store items + if (IS_LAST_TILE) + BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items, valid_items); + else + BlockStoreT(temp_storage.store).Store(d_out + tile_offset, items); + } + + + /** + * Scan a consecutive share of input tiles + */ + __device__ __forceinline__ void ConsumeRange( + OffsetT range_offset, ///< [in] Threadblock begin offset (inclusive) + OffsetT range_end) ///< [in] Threadblock end offset (exclusive) + { + BlockScanRunningPrefixOp prefix_op(scan_op); + + if (range_offset + TILE_ITEMS <= range_end) + { + // Consume first tile of input (full) + ConsumeTile(range_offset, prefix_op); + range_offset += TILE_ITEMS; + + // Consume subsequent full tiles of input + while (range_offset + TILE_ITEMS <= range_end) + { + ConsumeTile(range_offset, prefix_op); + range_offset += TILE_ITEMS; + } + + // Consume a partially-full tile + if (range_offset < range_end) + { + int valid_items = range_end - range_offset; + ConsumeTile(range_offset, prefix_op, valid_items); + } + } + else + { + // Consume the first tile of input (partially-full) + int valid_items = range_end - range_offset; + ConsumeTile(range_offset, prefix_op, valid_items); + } + } + + + /** + * Scan a consecutive share of input tiles, seeded with the specified prefix value + */ + __device__ __forceinline__ void ConsumeRange( + OffsetT range_offset, ///< [in] Threadblock begin offset (inclusive) + OffsetT 
range_end, ///< [in] Threadblock end offset (exclusive) + OutputT prefix) ///< [in] The prefix to apply to the scan segment + { + BlockScanRunningPrefixOp prefix_op(prefix, scan_op); + + // Consume full tiles of input + while (range_offset + TILE_ITEMS <= range_end) + { + ConsumeTile(range_offset, prefix_op); + range_offset += TILE_ITEMS; + } + + // Consume a partially-full tile + if (range_offset < range_end) + { + int valid_items = range_end - range_offset; + ConsumeTile(range_offset, prefix_op, valid_items); + } + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/agent/agent_segment_fixup.cuh b/GraphBLAS/CUDA/local_cub/agent/agent_segment_fixup.cuh new file mode 100644 index 0000000000..e2de58ed66 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/agent/agent_segment_fixup.cuh @@ -0,0 +1,375 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::AgentSegmentFixup implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key. 
+ */ + +#pragma once + +#include + +#include "single_pass_scan_operators.cuh" +#include "../block/block_load.cuh" +#include "../block/block_store.cuh" +#include "../block/block_scan.cuh" +#include "../block/block_discontinuity.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../iterator/constant_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for AgentSegmentFixup + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements + BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use +struct AgentSegmentFixupPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + }; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements + static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use +}; + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +/** + * \brief AgentSegmentFixup implements a stateful abstraction of CUDA thread blocks for participating in device-wide reduce-value-by-key + */ +template < + typename AgentSegmentFixupPolicyT, ///< Parameterized AgentSegmentFixupPolicy tuning policy type + typename PairsInputIteratorT, ///< Random-access input iterator type for keys + typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values + typename EqualityOpT, ///< KeyT equality operator type + typename ReductionOpT, ///< ValueT reduction operator type + typename OffsetT> ///< Signed integer type for global offsets +struct AgentSegmentFixup +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + // Data type of key-value input iterator + typedef typename std::iterator_traits::value_type KeyValuePairT; + + // Value type + typedef typename KeyValuePairT::Value ValueT; + + // Tile status descriptor interface type + typedef ReduceByKeyScanTileState ScanTileStateT; + + // Constants + enum + { + BLOCK_THREADS = AgentSegmentFixupPolicyT::BLOCK_THREADS, + ITEMS_PER_THREAD = AgentSegmentFixupPolicyT::ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + + // Whether or not do fixup using RLE + global atomics + USE_ATOMIC_FIXUP = (CUB_PTX_ARCH >= 350) && + (Equals::VALUE || + Equals::VALUE || + Equals::VALUE || + Equals::VALUE), + + // Whether or not the scan operation has a zero-valued identity value (true if we're performing addition on a primitive type) + HAS_IDENTITY_ZERO = (Equals::VALUE) && (Traits::PRIMITIVE), + }; + + // Cache-modified Input iterator wrapper type (for applying 
cache modifier) for keys + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator + PairsInputIteratorT>::Type // Directly use the supplied input iterator type + WrappedPairsInputIteratorT; + + // Cache-modified Input iterator wrapper type (for applying cache modifier) for fixup values + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator + AggregatesOutputIteratorT>::Type // Directly use the supplied input iterator type + WrappedFixupInputIteratorT; + + // Reduce-value-by-segment scan operator + typedef ReduceByKeyOp ReduceBySegmentOpT; + + // Parameterized BlockLoad type for pairs + typedef BlockLoad< + KeyValuePairT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + AgentSegmentFixupPolicyT::LOAD_ALGORITHM> + BlockLoadPairs; + + // Parameterized BlockScan type + typedef BlockScan< + KeyValuePairT, + BLOCK_THREADS, + AgentSegmentFixupPolicyT::SCAN_ALGORITHM> + BlockScanT; + + // Callback type for obtaining tile prefix during block scan + typedef TilePrefixCallbackOp< + KeyValuePairT, + ReduceBySegmentOpT, + ScanTileStateT> + TilePrefixCallbackOpT; + + // Shared memory type for this thread block + union _TempStorage + { + struct + { + typename BlockScanT::TempStorage scan; // Smem needed for tile scanning + typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback + }; + + // Smem needed for loading keys + typename BlockLoadPairs::TempStorage load_pairs; + }; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + _TempStorage& temp_storage; ///< Reference to temp_storage + WrappedPairsInputIteratorT d_pairs_in; ///< Input keys + AggregatesOutputIteratorT d_aggregates_out; ///< Output value aggregates + WrappedFixupInputIteratorT d_fixup_in; ///< Fixup input values + InequalityWrapper inequality_op; ///< KeyT inequality operator + ReductionOpT reduction_op; ///< Reduction operator + ReduceBySegmentOpT scan_op; ///< Reduce-by-segment scan operator + + + //--------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------- + + // Constructor + __device__ __forceinline__ + AgentSegmentFixup( + TempStorage& temp_storage, ///< Reference to temp_storage + PairsInputIteratorT d_pairs_in, ///< Input keys + AggregatesOutputIteratorT d_aggregates_out, ///< Output value aggregates + EqualityOpT equality_op, ///< KeyT equality operator + ReductionOpT reduction_op) ///< ValueT reduction operator + : + temp_storage(temp_storage.Alias()), + d_pairs_in(d_pairs_in), + d_aggregates_out(d_aggregates_out), + d_fixup_in(d_aggregates_out), + inequality_op(equality_op), + reduction_op(reduction_op), + scan_op(reduction_op) + {} + + + //--------------------------------------------------------------------- + // Cooperatively scan a device-wide sequence of tiles with other CTAs + //--------------------------------------------------------------------- + + + /** + * Process input tile. 
Specialized for atomic-fixup + */ + template + __device__ __forceinline__ void ConsumeTile( + OffsetT num_remaining, ///< Number of global input items remaining (including this tile) + int tile_idx, ///< Tile index + OffsetT tile_offset, ///< Tile offset + ScanTileStateT& tile_state, ///< Global tile state descriptor + Int2Type use_atomic_fixup) ///< Marker whether to use atomicAdd (instead of reduce-by-key) + { + KeyValuePairT pairs[ITEMS_PER_THREAD]; + + // Load pairs + KeyValuePairT oob_pair; + oob_pair.key = -1; + + if (IS_LAST_TILE) + BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs, num_remaining, oob_pair); + else + BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs); + + // RLE + #pragma unroll + for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + ValueT* d_scatter = d_aggregates_out + pairs[ITEM - 1].key; + if (pairs[ITEM].key != pairs[ITEM - 1].key) + atomicAdd(d_scatter, pairs[ITEM - 1].value); + else + pairs[ITEM].value = reduction_op(pairs[ITEM - 1].value, pairs[ITEM].value); + } + + // Flush last item if valid + ValueT* d_scatter = d_aggregates_out + pairs[ITEMS_PER_THREAD - 1].key; + if ((!IS_LAST_TILE) || (pairs[ITEMS_PER_THREAD - 1].key >= 0)) + atomicAdd(d_scatter, pairs[ITEMS_PER_THREAD - 1].value); + } + + + /** + * Process input tile. Specialized for reduce-by-key fixup + */ + template + __device__ __forceinline__ void ConsumeTile( + OffsetT num_remaining, ///< Number of global input items remaining (including this tile) + int tile_idx, ///< Tile index + OffsetT tile_offset, ///< Tile offset + ScanTileStateT& tile_state, ///< Global tile state descriptor + Int2Type use_atomic_fixup) ///< Marker whether to use atomicAdd (instead of reduce-by-key) + { + KeyValuePairT pairs[ITEMS_PER_THREAD]; + KeyValuePairT scatter_pairs[ITEMS_PER_THREAD]; + + // Load pairs + KeyValuePairT oob_pair; + oob_pair.key = -1; + + if (IS_LAST_TILE) + BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs, num_remaining, oob_pair); + else + BlockLoadPairs(temp_storage.load_pairs).Load(d_pairs_in + tile_offset, pairs); + + CTA_SYNC(); + + KeyValuePairT tile_aggregate; + if (tile_idx == 0) + { + // Exclusive scan of values and segment_flags + BlockScanT(temp_storage.scan).ExclusiveScan(pairs, scatter_pairs, scan_op, tile_aggregate); + + // Update tile status if this is not the last tile + if (threadIdx.x == 0) + { + // Set first segment id to not trigger a flush (invalid from exclusive scan) + scatter_pairs[0].key = pairs[0].key; + + if (!IS_LAST_TILE) + tile_state.SetInclusive(0, tile_aggregate); + + } + } + else + { + // Exclusive scan of values and segment_flags + TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, scan_op, tile_idx); + BlockScanT(temp_storage.scan).ExclusiveScan(pairs, scatter_pairs, scan_op, prefix_op); + tile_aggregate = prefix_op.GetBlockAggregate(); + } + + // Scatter updated values + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + if (scatter_pairs[ITEM].key != pairs[ITEM].key) + { + // Update the value at the key location + ValueT value = d_fixup_in[scatter_pairs[ITEM].key]; + value = reduction_op(value, scatter_pairs[ITEM].value); + + d_aggregates_out[scatter_pairs[ITEM].key] = value; + } + } + + // Finalize the last item + if (IS_LAST_TILE) + { + // Last thread will output final count and last item, if necessary + if (threadIdx.x == BLOCK_THREADS - 1) + { + // If the last tile is a whole tile, the inclusive prefix contains accumulated value 
reduction for the last segment + if (num_remaining == TILE_ITEMS) + { + // Update the value at the key location + OffsetT last_key = pairs[ITEMS_PER_THREAD - 1].key; + d_aggregates_out[last_key] = reduction_op(tile_aggregate.value, d_fixup_in[last_key]); + } + } + } + } + + + /** + * Scan tiles of items as part of a dynamic chained scan + */ + __device__ __forceinline__ void ConsumeRange( + int num_items, ///< Total number of input items + int num_tiles, ///< Total number of input tiles + ScanTileStateT& tile_state) ///< Global tile state descriptor + { + // Blocks are launched in increasing order, so just assign one tile per block + int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index + OffsetT tile_offset = tile_idx * TILE_ITEMS; // Global offset for the current tile + OffsetT num_remaining = num_items - tile_offset; // Remaining items (including this tile) + + if (num_remaining > TILE_ITEMS) + { + // Not the last tile (full) + ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state, Int2Type()); + } + else if (num_remaining > 0) + { + // The last tile (possibly partially-full) + ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state, Int2Type()); + } + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/agent/agent_select_if.cuh b/GraphBLAS/CUDA/local_cub/agent/agent_select_if.cuh new file mode 100644 index 0000000000..52ca9fc284 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/agent/agent_select_if.cuh @@ -0,0 +1,703 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::AgentSelectIf implements a stateful abstraction of CUDA thread blocks for participating in device-wide select. 
+ */ + +#pragma once + +#include + +#include "single_pass_scan_operators.cuh" +#include "../block/block_load.cuh" +#include "../block/block_store.cuh" +#include "../block/block_scan.cuh" +#include "../block/block_exchange.cuh" +#include "../block/block_discontinuity.cuh" +#include "../grid/grid_queue.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy types + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for AgentSelectIf + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + BlockLoadAlgorithm _LOAD_ALGORITHM, ///< The BlockLoad algorithm to use + CacheLoadModifier _LOAD_MODIFIER, ///< Cache load modifier for reading input elements + BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use +struct AgentSelectIfPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + }; + + static const BlockLoadAlgorithm LOAD_ALGORITHM = _LOAD_ALGORITHM; ///< The BlockLoad algorithm to use + static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER; ///< Cache load modifier for reading input elements + static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use +}; + + + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + + +/** + * \brief AgentSelectIf implements a stateful abstraction of CUDA thread blocks for participating in device-wide selection + * + * Performs functor-based selection if SelectOpT functor type != NullType + * Otherwise performs flag-based selection if FlagsInputIterator's value type != NullType + * Otherwise performs discontinuity selection (keep unique) + */ +template < + typename AgentSelectIfPolicyT, ///< Parameterized AgentSelectIfPolicy tuning policy type + typename InputIteratorT, ///< Random-access input iterator type for selection items + typename FlagsInputIteratorT, ///< Random-access input iterator type for selections (NullType* if a selection functor or discontinuity flagging is to be used for selection) + typename SelectedOutputIteratorT, ///< Random-access input iterator type for selection_flags items + typename SelectOpT, ///< Selection operator type (NullType if selections or discontinuity flagging is to be used for selection) + typename EqualityOpT, ///< Equality operator type (NullType if selection functor or selections is to be used for selection) + typename OffsetT, ///< Signed integer type for global offsets + bool KEEP_REJECTS> ///< Whether or not we push rejected items to the back of the output +struct AgentSelectIf +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + // The input value type + typedef typename std::iterator_traits::value_type InputT; + + // The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? 
+ typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type + + // The flag value type + typedef typename std::iterator_traits::value_type FlagT; + + // Tile status descriptor interface type + typedef ScanTileState ScanTileStateT; + + // Constants + enum + { + USE_SELECT_OP, + USE_SELECT_FLAGS, + USE_DISCONTINUITY, + + BLOCK_THREADS = AgentSelectIfPolicyT::BLOCK_THREADS, + ITEMS_PER_THREAD = AgentSelectIfPolicyT::ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + TWO_PHASE_SCATTER = (ITEMS_PER_THREAD > 1), + + SELECT_METHOD = (!Equals::VALUE) ? + USE_SELECT_OP : + (!Equals::VALUE) ? + USE_SELECT_FLAGS : + USE_DISCONTINUITY + }; + + // Cache-modified Input iterator wrapper type (for applying cache modifier) for items + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator + InputIteratorT>::Type // Directly use the supplied input iterator type + WrappedInputIteratorT; + + // Cache-modified Input iterator wrapper type (for applying cache modifier) for values + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedValuesInputIterator + FlagsInputIteratorT>::Type // Directly use the supplied input iterator type + WrappedFlagsInputIteratorT; + + // Parameterized BlockLoad type for input data + typedef BlockLoad< + OutputT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + AgentSelectIfPolicyT::LOAD_ALGORITHM> + BlockLoadT; + + // Parameterized BlockLoad type for flags + typedef BlockLoad< + FlagT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + AgentSelectIfPolicyT::LOAD_ALGORITHM> + BlockLoadFlags; + + // Parameterized BlockDiscontinuity type for items + typedef BlockDiscontinuity< + OutputT, + BLOCK_THREADS> + BlockDiscontinuityT; + + // Parameterized BlockScan type + typedef BlockScan< + OffsetT, + BLOCK_THREADS, + AgentSelectIfPolicyT::SCAN_ALGORITHM> + BlockScanT; + + // Callback type for obtaining tile prefix during block scan + typedef TilePrefixCallbackOp< + OffsetT, + cub::Sum, + ScanTileStateT> + TilePrefixCallbackOpT; + + // Item exchange type + typedef OutputT ItemExchangeT[TILE_ITEMS]; + + // Shared memory type for this thread block + union _TempStorage + { + struct + { + typename BlockScanT::TempStorage scan; // Smem needed for tile scanning + typename TilePrefixCallbackOpT::TempStorage prefix; // Smem needed for cooperative prefix callback + typename BlockDiscontinuityT::TempStorage discontinuity; // Smem needed for discontinuity detection + }; + + // Smem needed for loading items + typename BlockLoadT::TempStorage load_items; + + // Smem needed for loading values + typename BlockLoadFlags::TempStorage load_flags; + + // Smem needed for compacting items (allows non POD items in this union) + Uninitialized raw_exchange; + }; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + _TempStorage& temp_storage; ///< Reference to temp_storage + WrappedInputIteratorT d_in; ///< Input items + SelectedOutputIteratorT d_selected_out; ///< Unique output items + WrappedFlagsInputIteratorT d_flags_in; ///< Input selection flags (if applicable) + InequalityWrapper inequality_op; ///< T inequality operator + 
SelectOpT select_op; ///< Selection operator + OffsetT num_items; ///< Total number of input items + + + //--------------------------------------------------------------------- + // Constructor + //--------------------------------------------------------------------- + + // Constructor + __device__ __forceinline__ + AgentSelectIf( + TempStorage &temp_storage, ///< Reference to temp_storage + InputIteratorT d_in, ///< Input data + FlagsInputIteratorT d_flags_in, ///< Input selection flags (if applicable) + SelectedOutputIteratorT d_selected_out, ///< Output data + SelectOpT select_op, ///< Selection operator + EqualityOpT equality_op, ///< Equality operator + OffsetT num_items) ///< Total number of input items + : + temp_storage(temp_storage.Alias()), + d_in(d_in), + d_flags_in(d_flags_in), + d_selected_out(d_selected_out), + select_op(select_op), + inequality_op(equality_op), + num_items(num_items) + {} + + + //--------------------------------------------------------------------- + // Utility methods for initializing the selections + //--------------------------------------------------------------------- + + /** + * Initialize selections (specialized for selection operator) + */ + template + __device__ __forceinline__ void InitializeSelections( + OffsetT /*tile_offset*/, + OffsetT num_tile_items, + OutputT (&items)[ITEMS_PER_THREAD], + OffsetT (&selection_flags)[ITEMS_PER_THREAD], + Int2Type /*select_method*/) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + // Out-of-bounds items are selection_flags + selection_flags[ITEM] = 1; + + if (!IS_LAST_TILE || (OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM < num_tile_items)) + selection_flags[ITEM] = select_op(items[ITEM]); + } + } + + + /** + * Initialize selections (specialized for valid flags) + */ + template + __device__ __forceinline__ void InitializeSelections( + OffsetT tile_offset, + OffsetT num_tile_items, + OutputT (&/*items*/)[ITEMS_PER_THREAD], + OffsetT (&selection_flags)[ITEMS_PER_THREAD], + Int2Type /*select_method*/) + { + CTA_SYNC(); + + FlagT flags[ITEMS_PER_THREAD]; + + if (IS_LAST_TILE) + { + // Out-of-bounds items are selection_flags + BlockLoadFlags(temp_storage.load_flags).Load(d_flags_in + tile_offset, flags, num_tile_items, 1); + } + else + { + BlockLoadFlags(temp_storage.load_flags).Load(d_flags_in + tile_offset, flags); + } + + // Convert flag type to selection_flags type + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + selection_flags[ITEM] = flags[ITEM]; + } + } + + + /** + * Initialize selections (specialized for discontinuity detection) + */ + template + __device__ __forceinline__ void InitializeSelections( + OffsetT tile_offset, + OffsetT num_tile_items, + OutputT (&items)[ITEMS_PER_THREAD], + OffsetT (&selection_flags)[ITEMS_PER_THREAD], + Int2Type /*select_method*/) + { + if (IS_FIRST_TILE) + { + CTA_SYNC(); + + // Set head selection_flags. 
First tile sets the first flag for the first item + BlockDiscontinuityT(temp_storage.discontinuity).FlagHeads(selection_flags, items, inequality_op); + } + else + { + OutputT tile_predecessor; + if (threadIdx.x == 0) + tile_predecessor = d_in[tile_offset - 1]; + + CTA_SYNC(); + + BlockDiscontinuityT(temp_storage.discontinuity).FlagHeads(selection_flags, items, inequality_op, tile_predecessor); + } + + // Set selection flags for out-of-bounds items + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + // Set selection_flags for out-of-bounds items + if ((IS_LAST_TILE) && (OffsetT(threadIdx.x * ITEMS_PER_THREAD) + ITEM >= num_tile_items)) + selection_flags[ITEM] = 1; + } + } + + + //--------------------------------------------------------------------- + // Scatter utility methods + //--------------------------------------------------------------------- + + /** + * Scatter flagged items to output offsets (specialized for direct scattering) + */ + template + __device__ __forceinline__ void ScatterDirect( + OutputT (&items)[ITEMS_PER_THREAD], + OffsetT (&selection_flags)[ITEMS_PER_THREAD], + OffsetT (&selection_indices)[ITEMS_PER_THREAD], + OffsetT num_selections) + { + // Scatter flagged items + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + if (selection_flags[ITEM]) + { + if ((!IS_LAST_TILE) || selection_indices[ITEM] < num_selections) + { + d_selected_out[selection_indices[ITEM]] = items[ITEM]; + } + } + } + } + + + /** + * Scatter flagged items to output offsets (specialized for two-phase scattering) + */ + template + __device__ __forceinline__ void ScatterTwoPhase( + OutputT (&items)[ITEMS_PER_THREAD], + OffsetT (&selection_flags)[ITEMS_PER_THREAD], + OffsetT (&selection_indices)[ITEMS_PER_THREAD], + int /*num_tile_items*/, ///< Number of valid items in this tile + int num_tile_selections, ///< Number of selections in this tile + OffsetT num_selections_prefix, ///< Total number of selections prior to this tile + OffsetT /*num_rejected_prefix*/, ///< Total number of rejections prior to this tile + Int2Type /*is_keep_rejects*/) ///< Marker type indicating whether to keep rejected items in the second partition + { + CTA_SYNC(); + + // Compact and scatter items + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + int local_scatter_offset = selection_indices[ITEM] - num_selections_prefix; + if (selection_flags[ITEM]) + { + temp_storage.raw_exchange.Alias()[local_scatter_offset] = items[ITEM]; + } + } + + CTA_SYNC(); + + for (int item = threadIdx.x; item < num_tile_selections; item += BLOCK_THREADS) + { + d_selected_out[num_selections_prefix + item] = temp_storage.raw_exchange.Alias()[item]; + } + } + + + /** + * Scatter flagged items to output offsets (specialized for two-phase scattering) + */ + template + __device__ __forceinline__ void ScatterTwoPhase( + OutputT (&items)[ITEMS_PER_THREAD], + OffsetT (&selection_flags)[ITEMS_PER_THREAD], + OffsetT (&selection_indices)[ITEMS_PER_THREAD], + int num_tile_items, ///< Number of valid items in this tile + int num_tile_selections, ///< Number of selections in this tile + OffsetT num_selections_prefix, ///< Total number of selections prior to this tile + OffsetT num_rejected_prefix, ///< Total number of rejections prior to this tile + Int2Type /*is_keep_rejects*/) ///< Marker type indicating whether to keep rejected items in the second partition + { + CTA_SYNC(); + + int tile_num_rejections = num_tile_items - num_tile_selections; + + // Scatter items to shared memory 
(rejections first) + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + int item_idx = (threadIdx.x * ITEMS_PER_THREAD) + ITEM; + int local_selection_idx = selection_indices[ITEM] - num_selections_prefix; + int local_rejection_idx = item_idx - local_selection_idx; + int local_scatter_offset = (selection_flags[ITEM]) ? + tile_num_rejections + local_selection_idx : + local_rejection_idx; + + temp_storage.raw_exchange.Alias()[local_scatter_offset] = items[ITEM]; + } + + CTA_SYNC(); + + // Gather items from shared memory and scatter to global + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + int item_idx = (ITEM * BLOCK_THREADS) + threadIdx.x; + int rejection_idx = item_idx; + int selection_idx = item_idx - tile_num_rejections; + OffsetT scatter_offset = (item_idx < tile_num_rejections) ? + num_items - num_rejected_prefix - rejection_idx - 1 : + num_selections_prefix + selection_idx; + + OutputT item = temp_storage.raw_exchange.Alias()[item_idx]; + + if (!IS_LAST_TILE || (item_idx < num_tile_items)) + { + d_selected_out[scatter_offset] = item; + } + } + } + + + /** + * Scatter flagged items + */ + template + __device__ __forceinline__ void Scatter( + OutputT (&items)[ITEMS_PER_THREAD], + OffsetT (&selection_flags)[ITEMS_PER_THREAD], + OffsetT (&selection_indices)[ITEMS_PER_THREAD], + int num_tile_items, ///< Number of valid items in this tile + int num_tile_selections, ///< Number of selections in this tile + OffsetT num_selections_prefix, ///< Total number of selections prior to this tile + OffsetT num_rejected_prefix, ///< Total number of rejections prior to this tile + OffsetT num_selections) ///< Total number of selections including this tile + { + // Do a two-phase scatter if (a) keeping both partitions or (b) two-phase is enabled and the average number of selection_flags items per thread is greater than one + if (KEEP_REJECTS || (TWO_PHASE_SCATTER && (num_tile_selections > BLOCK_THREADS))) + { + ScatterTwoPhase( + items, + selection_flags, + selection_indices, + num_tile_items, + num_tile_selections, + num_selections_prefix, + num_rejected_prefix, + Int2Type()); + } + else + { + ScatterDirect( + items, + selection_flags, + selection_indices, + num_selections); + } + } + + //--------------------------------------------------------------------- + // Cooperatively scan a device-wide sequence of tiles with other CTAs + //--------------------------------------------------------------------- + + + /** + * Process first tile of input (dynamic chained scan). 
Returns the running count of selections (including this tile) + */ + template + __device__ __forceinline__ OffsetT ConsumeFirstTile( + int num_tile_items, ///< Number of input items comprising this tile + OffsetT tile_offset, ///< Tile offset + ScanTileStateT& tile_state) ///< Global tile state descriptor + { + OutputT items[ITEMS_PER_THREAD]; + OffsetT selection_flags[ITEMS_PER_THREAD]; + OffsetT selection_indices[ITEMS_PER_THREAD]; + + // Load items + if (IS_LAST_TILE) + BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items, num_tile_items); + else + BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items); + + // Initialize selection_flags + InitializeSelections( + tile_offset, + num_tile_items, + items, + selection_flags, + Int2Type()); + + CTA_SYNC(); + + // Exclusive scan of selection_flags + OffsetT num_tile_selections; + BlockScanT(temp_storage.scan).ExclusiveSum(selection_flags, selection_indices, num_tile_selections); + + if (threadIdx.x == 0) + { + // Update tile status if this is not the last tile + if (!IS_LAST_TILE) + tile_state.SetInclusive(0, num_tile_selections); + } + + // Discount any out-of-bounds selections + if (IS_LAST_TILE) + num_tile_selections -= (TILE_ITEMS - num_tile_items); + + // Scatter flagged items + Scatter( + items, + selection_flags, + selection_indices, + num_tile_items, + num_tile_selections, + 0, + 0, + num_tile_selections); + + return num_tile_selections; + } + + + /** + * Process subsequent tile of input (dynamic chained scan). Returns the running count of selections (including this tile) + */ + template + __device__ __forceinline__ OffsetT ConsumeSubsequentTile( + int num_tile_items, ///< Number of input items comprising this tile + int tile_idx, ///< Tile index + OffsetT tile_offset, ///< Tile offset + ScanTileStateT& tile_state) ///< Global tile state descriptor + { + OutputT items[ITEMS_PER_THREAD]; + OffsetT selection_flags[ITEMS_PER_THREAD]; + OffsetT selection_indices[ITEMS_PER_THREAD]; + + // Load items + if (IS_LAST_TILE) + BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items, num_tile_items); + else + BlockLoadT(temp_storage.load_items).Load(d_in + tile_offset, items); + + // Initialize selection_flags + InitializeSelections( + tile_offset, + num_tile_items, + items, + selection_flags, + Int2Type()); + + CTA_SYNC(); + + // Exclusive scan of values and selection_flags + TilePrefixCallbackOpT prefix_op(tile_state, temp_storage.prefix, cub::Sum(), tile_idx); + BlockScanT(temp_storage.scan).ExclusiveSum(selection_flags, selection_indices, prefix_op); + + OffsetT num_tile_selections = prefix_op.GetBlockAggregate(); + OffsetT num_selections = prefix_op.GetInclusivePrefix(); + OffsetT num_selections_prefix = prefix_op.GetExclusivePrefix(); + OffsetT num_rejected_prefix = (tile_idx * TILE_ITEMS) - num_selections_prefix; + + // Discount any out-of-bounds selections + if (IS_LAST_TILE) + { + int num_discount = TILE_ITEMS - num_tile_items; + num_selections -= num_discount; + num_tile_selections -= num_discount; + } + + // Scatter flagged items + Scatter( + items, + selection_flags, + selection_indices, + num_tile_items, + num_tile_selections, + num_selections_prefix, + num_rejected_prefix, + num_selections); + + return num_selections; + } + + + /** + * Process a tile of input + */ + template + __device__ __forceinline__ OffsetT ConsumeTile( + int num_tile_items, ///< Number of input items comprising this tile + int tile_idx, ///< Tile index + OffsetT tile_offset, ///< Tile offset + ScanTileStateT& tile_state) 
///< Global tile state descriptor + { + OffsetT num_selections; + if (tile_idx == 0) + { + num_selections = ConsumeFirstTile(num_tile_items, tile_offset, tile_state); + } + else + { + num_selections = ConsumeSubsequentTile(num_tile_items, tile_idx, tile_offset, tile_state); + } + + return num_selections; + } + + + /** + * Scan tiles of items as part of a dynamic chained scan + */ + template ///< Output iterator type for recording number of items selection_flags + __device__ __forceinline__ void ConsumeRange( + int num_tiles, ///< Total number of input tiles + ScanTileStateT& tile_state, ///< Global tile state descriptor + NumSelectedIteratorT d_num_selected_out) ///< Output total number selection_flags + { + // Blocks are launched in increasing order, so just assign one tile per block + int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index + OffsetT tile_offset = tile_idx * TILE_ITEMS; // Global offset for the current tile + + if (tile_idx < num_tiles - 1) + { + // Not the last tile (full) + ConsumeTile(TILE_ITEMS, tile_idx, tile_offset, tile_state); + } + else + { + // The last tile (possibly partially-full) + OffsetT num_remaining = num_items - tile_offset; + OffsetT num_selections = ConsumeTile(num_remaining, tile_idx, tile_offset, tile_state); + + if (threadIdx.x == 0) + { + // Output the total number of items selection_flags + *d_num_selected_out = num_selections; + } + } + } + +}; + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/agent/agent_spmv_orig.cuh b/GraphBLAS/CUDA/local_cub/agent/agent_spmv_orig.cuh new file mode 100644 index 0000000000..54e2a13946 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/agent/agent_spmv_orig.cuh @@ -0,0 +1,670 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +/** + * \file + * cub::AgentSpmv implements a stateful abstraction of CUDA thread blocks for participating in device-wide SpMV. + */ + +#pragma once + +#include + +#include "../util_type.cuh" +#include "../block/block_reduce.cuh" +#include "../block/block_scan.cuh" +#include "../block/block_exchange.cuh" +#include "../thread/thread_search.cuh" +#include "../thread/thread_operators.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../iterator/counting_input_iterator.cuh" +#include "../iterator/tex_ref_input_iterator.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Tuning policy + ******************************************************************************/ + +/** + * Parameterizable tuning policy type for AgentSpmv + */ +template < + int _BLOCK_THREADS, ///< Threads per thread block + int _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + CacheLoadModifier _ROW_OFFSETS_SEARCH_LOAD_MODIFIER, ///< Cache load modifier for reading CSR row-offsets during search + CacheLoadModifier _ROW_OFFSETS_LOAD_MODIFIER, ///< Cache load modifier for reading CSR row-offsets + CacheLoadModifier _COLUMN_INDICES_LOAD_MODIFIER, ///< Cache load modifier for reading CSR column-indices + CacheLoadModifier _VALUES_LOAD_MODIFIER, ///< Cache load modifier for reading CSR values + CacheLoadModifier _VECTOR_VALUES_LOAD_MODIFIER, ///< Cache load modifier for reading vector values + bool _DIRECT_LOAD_NONZEROS, ///< Whether to load nonzeros directly from global during sequential merging (vs. pre-staged through shared memory) + BlockScanAlgorithm _SCAN_ALGORITHM> ///< The BlockScan algorithm to use +struct AgentSpmvPolicy +{ + enum + { + BLOCK_THREADS = _BLOCK_THREADS, ///< Threads per thread block + ITEMS_PER_THREAD = _ITEMS_PER_THREAD, ///< Items per thread (per tile of input) + DIRECT_LOAD_NONZEROS = _DIRECT_LOAD_NONZEROS, ///< Whether to load nonzeros directly from global during sequential merging (pre-staged through shared memory) + }; + + static const CacheLoadModifier ROW_OFFSETS_SEARCH_LOAD_MODIFIER = _ROW_OFFSETS_SEARCH_LOAD_MODIFIER; ///< Cache load modifier for reading CSR row-offsets + static const CacheLoadModifier ROW_OFFSETS_LOAD_MODIFIER = _ROW_OFFSETS_LOAD_MODIFIER; ///< Cache load modifier for reading CSR row-offsets + static const CacheLoadModifier COLUMN_INDICES_LOAD_MODIFIER = _COLUMN_INDICES_LOAD_MODIFIER; ///< Cache load modifier for reading CSR column-indices + static const CacheLoadModifier VALUES_LOAD_MODIFIER = _VALUES_LOAD_MODIFIER; ///< Cache load modifier for reading CSR values + static const CacheLoadModifier VECTOR_VALUES_LOAD_MODIFIER = _VECTOR_VALUES_LOAD_MODIFIER; ///< Cache load modifier for reading vector values + static const BlockScanAlgorithm SCAN_ALGORITHM = _SCAN_ALGORITHM; ///< The BlockScan algorithm to use + +}; + + +/****************************************************************************** + * Thread block abstractions + ******************************************************************************/ + +template < + typename ValueT, ///< Matrix and vector value type + typename OffsetT> ///< Signed integer type for sequence offsets +struct SpmvParams +{ + ValueT* d_values; ///< Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix A. 
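The SpmvParams bundle describes A in CSR form through row-end offsets (the end of row r in d_column_indices/d_values is d_row_end_offsets[r]), and the agent ultimately computes y = alpha*A*x + beta*y. For orientation, here is a plain host-side reference under that convention; SpmvReference and its parameter names are invented for this sketch, and it deliberately ignores the HAS_ALPHA/HAS_BETA compile-time specializations used by the agent.

#include <cstddef>
#include <vector>

// y = alpha * A * x + beta * y, with A in CSR form given via row-end offsets:
// row r owns nonzeros k in [ r == 0 ? 0 : row_end[r-1], row_end[r] ).
template <typename ValueT, typename OffsetT>
void SpmvReference(
    const std::vector<OffsetT>& row_end,   // num_rows row-end offsets
    const std::vector<OffsetT>& col_idx,   // num_nonzeros column indices
    const std::vector<ValueT>&  values,    // num_nonzeros values
    const std::vector<ValueT>&  x,         // dense input vector (num_cols)
    std::vector<ValueT>&        y,         // dense output vector (num_rows), updated in place
    ValueT                      alpha,
    ValueT                      beta)
{
    OffsetT row_start = 0;
    for (std::size_t r = 0; r < row_end.size(); ++r)
    {
        ValueT dot = 0;
        for (OffsetT k = row_start; k < row_end[r]; ++k)
            dot += values[k] * x[col_idx[k]];

        y[r] = alpha * dot + beta * y[r];
        row_start = row_end[r];
    }
}

Storing row-end offsets (rather than row-start offsets) fits the merge-path approach used further down: the agent treats the row-end offsets and the natural numbering of the nonzeros as two sorted lists and partitions work along their merge path.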
+ OffsetT* d_row_end_offsets; ///< Pointer to the array of \p m offsets demarcating the end of every row in \p d_column_indices and \p d_values + OffsetT* d_column_indices; ///< Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix A. (Indices are zero-valued.) + ValueT* d_vector_x; ///< Pointer to the array of \p num_cols values corresponding to the dense input vector x + ValueT* d_vector_y; ///< Pointer to the array of \p num_rows values corresponding to the dense output vector y + int num_rows; ///< Number of rows of matrix A. + int num_cols; ///< Number of columns of matrix A. + int num_nonzeros; ///< Number of nonzero elements of matrix A. + ValueT alpha; ///< Alpha multiplicand + ValueT beta; ///< Beta addend-multiplicand + + TexRefInputIterator t_vector_x; +}; + + +/** + * \brief AgentSpmv implements a stateful abstraction of CUDA thread blocks for participating in device-wide SpMV. + */ +template < + typename AgentSpmvPolicyT, ///< Parameterized AgentSpmvPolicy tuning policy type + typename ValueT, ///< Matrix and vector value type + typename OffsetT, ///< Signed integer type for sequence offsets + bool HAS_ALPHA, ///< Whether the input parameter \p alpha is 1 + bool HAS_BETA, ///< Whether the input parameter \p beta is 0 + int PTX_ARCH = CUB_PTX_ARCH> ///< PTX compute capability +struct AgentSpmv +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + /// Constants + enum + { + BLOCK_THREADS = AgentSpmvPolicyT::BLOCK_THREADS, + ITEMS_PER_THREAD = AgentSpmvPolicyT::ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + }; + + /// 2D merge path coordinate type + typedef typename CubVector::Type CoordinateT; + + /// Input iterator wrapper types (for applying cache modifiers) + + typedef CacheModifiedInputIterator< + AgentSpmvPolicyT::ROW_OFFSETS_SEARCH_LOAD_MODIFIER, + OffsetT, + OffsetT> + RowOffsetsSearchIteratorT; + + typedef CacheModifiedInputIterator< + AgentSpmvPolicyT::ROW_OFFSETS_LOAD_MODIFIER, + OffsetT, + OffsetT> + RowOffsetsIteratorT; + + typedef CacheModifiedInputIterator< + AgentSpmvPolicyT::COLUMN_INDICES_LOAD_MODIFIER, + OffsetT, + OffsetT> + ColumnIndicesIteratorT; + + typedef CacheModifiedInputIterator< + AgentSpmvPolicyT::VALUES_LOAD_MODIFIER, + ValueT, + OffsetT> + ValueIteratorT; + + typedef CacheModifiedInputIterator< + AgentSpmvPolicyT::VECTOR_VALUES_LOAD_MODIFIER, + ValueT, + OffsetT> + VectorValueIteratorT; + + // Tuple type for scanning (pairs accumulated segment-value with segment-index) + typedef KeyValuePair KeyValuePairT; + + // Reduce-value-by-segment scan operator + typedef ReduceByKeyOp ReduceBySegmentOpT; + + // BlockReduce specialization + typedef BlockReduce< + ValueT, + BLOCK_THREADS, + BLOCK_REDUCE_WARP_REDUCTIONS> + BlockReduceT; + + // BlockScan specialization + typedef BlockScan< + KeyValuePairT, + BLOCK_THREADS, + AgentSpmvPolicyT::SCAN_ALGORITHM> + BlockScanT; + + // BlockScan specialization + typedef BlockScan< + ValueT, + BLOCK_THREADS, + AgentSpmvPolicyT::SCAN_ALGORITHM> + BlockPrefixSumT; + + // BlockExchange specialization + typedef BlockExchange< + ValueT, + BLOCK_THREADS, + ITEMS_PER_THREAD> + BlockExchangeT; + + /// Merge item type (either a non-zero value or a row-end offset) + union MergeItem + { + // Value type to pair with index type OffsetT (NullType if loading values directly during merge) + typedef typename If::Type MergeValueT; + + OffsetT 
row_end_offset; + MergeValueT nonzero; + }; + + /// Shared memory type required by this thread block + struct _TempStorage + { + CoordinateT tile_coords[2]; + + union Aliasable + { + // Smem needed for tile of merge items + MergeItem merge_items[ITEMS_PER_THREAD + TILE_ITEMS + 1]; + + // Smem needed for block exchange + typename BlockExchangeT::TempStorage exchange; + + // Smem needed for block-wide reduction + typename BlockReduceT::TempStorage reduce; + + // Smem needed for tile scanning + typename BlockScanT::TempStorage scan; + + // Smem needed for tile prefix sum + typename BlockPrefixSumT::TempStorage prefix_sum; + + } aliasable; + }; + + /// Temporary storage type (unionable) + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + + _TempStorage& temp_storage; /// Reference to temp_storage + + SpmvParams& spmv_params; + + ValueIteratorT wd_values; ///< Wrapped pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix A. + RowOffsetsIteratorT wd_row_end_offsets; ///< Wrapped Pointer to the array of \p m offsets demarcating the end of every row in \p d_column_indices and \p d_values + ColumnIndicesIteratorT wd_column_indices; ///< Wrapped Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix A. (Indices are zero-valued.) + VectorValueIteratorT wd_vector_x; ///< Wrapped Pointer to the array of \p num_cols values corresponding to the dense input vector x + VectorValueIteratorT wd_vector_y; ///< Wrapped Pointer to the array of \p num_cols values corresponding to the dense input vector x + + + //--------------------------------------------------------------------- + // Interface + //--------------------------------------------------------------------- + + /** + * Constructor + */ + __device__ __forceinline__ AgentSpmv( + TempStorage& temp_storage, ///< Reference to temp_storage + SpmvParams& spmv_params) ///< SpMV input parameter bundle + : + temp_storage(temp_storage.Alias()), + spmv_params(spmv_params), + wd_values(spmv_params.d_values), + wd_row_end_offsets(spmv_params.d_row_end_offsets), + wd_column_indices(spmv_params.d_column_indices), + wd_vector_x(spmv_params.d_vector_x), + wd_vector_y(spmv_params.d_vector_y) + {} + + + + + /** + * Consume a merge tile, specialized for direct-load of nonzeros + */ + __device__ __forceinline__ KeyValuePairT ConsumeTile( + int tile_idx, + CoordinateT tile_start_coord, + CoordinateT tile_end_coord, + Int2Type is_direct_load) ///< Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch + { + int tile_num_rows = tile_end_coord.x - tile_start_coord.x; + int tile_num_nonzeros = tile_end_coord.y - tile_start_coord.y; + OffsetT* s_tile_row_end_offsets = &temp_storage.aliasable.merge_items[0].row_end_offset; + + // Gather the row end-offsets for the merge tile into shared memory + for (int item = threadIdx.x; item <= tile_num_rows; item += BLOCK_THREADS) + { + s_tile_row_end_offsets[item] = wd_row_end_offsets[tile_start_coord.x + item]; + } + + CTA_SYNC(); + + // Search for the thread's starting coordinate within the merge tile + CountingInputIterator tile_nonzero_indices(tile_start_coord.y); + CoordinateT thread_start_coord; + + MergePathSearch( + OffsetT(threadIdx.x * ITEMS_PER_THREAD), // Diagonal + s_tile_row_end_offsets, // List A + 
tile_nonzero_indices, // List B + tile_num_rows, + tile_num_nonzeros, + thread_start_coord); + + CTA_SYNC(); // Perf-sync + + // Compute the thread's merge path segment + CoordinateT thread_current_coord = thread_start_coord; + KeyValuePairT scan_segment[ITEMS_PER_THREAD]; + + ValueT running_total = 0.0; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + OffsetT nonzero_idx = CUB_MIN(tile_nonzero_indices[thread_current_coord.y], spmv_params.num_nonzeros - 1); + OffsetT column_idx = wd_column_indices[nonzero_idx]; + ValueT value = wd_values[nonzero_idx]; + + ValueT vector_value = spmv_params.t_vector_x[column_idx]; +#if (CUB_PTX_ARCH >= 350) + vector_value = wd_vector_x[column_idx]; +#endif + ValueT nonzero = value * vector_value; + + OffsetT row_end_offset = s_tile_row_end_offsets[thread_current_coord.x]; + + if (tile_nonzero_indices[thread_current_coord.y] < row_end_offset) + { + // Move down (accumulate) + running_total += nonzero; + scan_segment[ITEM].value = running_total; + scan_segment[ITEM].key = tile_num_rows; + ++thread_current_coord.y; + } + else + { + // Move right (reset) + scan_segment[ITEM].value = running_total; + scan_segment[ITEM].key = thread_current_coord.x; + running_total = 0.0; + ++thread_current_coord.x; + } + } + + CTA_SYNC(); + + // Block-wide reduce-value-by-segment + KeyValuePairT tile_carry; + ReduceBySegmentOpT scan_op; + KeyValuePairT scan_item; + + scan_item.value = running_total; + scan_item.key = thread_current_coord.x; + + BlockScanT(temp_storage.aliasable.scan).ExclusiveScan(scan_item, scan_item, scan_op, tile_carry); + + if (tile_num_rows > 0) + { + if (threadIdx.x == 0) + scan_item.key = -1; + + // Direct scatter + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + if (scan_segment[ITEM].key < tile_num_rows) + { + if (scan_item.key == scan_segment[ITEM].key) + scan_segment[ITEM].value = scan_item.value + scan_segment[ITEM].value; + + if (HAS_ALPHA) + { + scan_segment[ITEM].value *= spmv_params.alpha; + } + + if (HAS_BETA) + { + // Update the output vector element + ValueT addend = spmv_params.beta * wd_vector_y[tile_start_coord.x + scan_segment[ITEM].key]; + scan_segment[ITEM].value += addend; + } + + // Set the output vector element + spmv_params.d_vector_y[tile_start_coord.x + scan_segment[ITEM].key] = scan_segment[ITEM].value; + } + } + } + + // Return the tile's running carry-out + return tile_carry; + } + + + + /** + * Consume a merge tile, specialized for indirect load of nonzeros + */ + __device__ __forceinline__ KeyValuePairT ConsumeTile( + int tile_idx, + CoordinateT tile_start_coord, + CoordinateT tile_end_coord, + Int2Type is_direct_load) ///< Marker type indicating whether to load nonzeros directly during path-discovery or beforehand in batch + { + int tile_num_rows = tile_end_coord.x - tile_start_coord.x; + int tile_num_nonzeros = tile_end_coord.y - tile_start_coord.y; + +#if (CUB_PTX_ARCH >= 520) + + OffsetT* s_tile_row_end_offsets = &temp_storage.aliasable.merge_items[0].row_end_offset; + ValueT* s_tile_nonzeros = &temp_storage.aliasable.merge_items[tile_num_rows + ITEMS_PER_THREAD].nonzero; + + // Gather the nonzeros for the merge tile into shared memory + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + int nonzero_idx = threadIdx.x + (ITEM * BLOCK_THREADS); + + ValueIteratorT a = wd_values + tile_start_coord.y + nonzero_idx; + ColumnIndicesIteratorT ci = wd_column_indices + tile_start_coord.y + nonzero_idx; + ValueT* s = s_tile_nonzeros + nonzero_idx; + + if 
(nonzero_idx < tile_num_nonzeros) + { + + OffsetT column_idx = *ci; + ValueT value = *a; + + ValueT vector_value = spmv_params.t_vector_x[column_idx]; + vector_value = wd_vector_x[column_idx]; + + ValueT nonzero = value * vector_value; + + *s = nonzero; + } + } + + +#else + + OffsetT* s_tile_row_end_offsets = &temp_storage.aliasable.merge_items[0].row_end_offset; + ValueT* s_tile_nonzeros = &temp_storage.aliasable.merge_items[tile_num_rows + ITEMS_PER_THREAD].nonzero; + + // Gather the nonzeros for the merge tile into shared memory + if (tile_num_nonzeros > 0) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + int nonzero_idx = threadIdx.x + (ITEM * BLOCK_THREADS); + nonzero_idx = CUB_MIN(nonzero_idx, tile_num_nonzeros - 1); + + OffsetT column_idx = wd_column_indices[tile_start_coord.y + nonzero_idx]; + ValueT value = wd_values[tile_start_coord.y + nonzero_idx]; + + ValueT vector_value = spmv_params.t_vector_x[column_idx]; +#if (CUB_PTX_ARCH >= 350) + vector_value = wd_vector_x[column_idx]; +#endif + ValueT nonzero = value * vector_value; + + s_tile_nonzeros[nonzero_idx] = nonzero; + } + } + +#endif + + // Gather the row end-offsets for the merge tile into shared memory + #pragma unroll 1 + for (int item = threadIdx.x; item <= tile_num_rows; item += BLOCK_THREADS) + { + s_tile_row_end_offsets[item] = wd_row_end_offsets[tile_start_coord.x + item]; + } + + CTA_SYNC(); + + // Search for the thread's starting coordinate within the merge tile + CountingInputIterator tile_nonzero_indices(tile_start_coord.y); + CoordinateT thread_start_coord; + + MergePathSearch( + OffsetT(threadIdx.x * ITEMS_PER_THREAD), // Diagonal + s_tile_row_end_offsets, // List A + tile_nonzero_indices, // List B + tile_num_rows, + tile_num_nonzeros, + thread_start_coord); + + CTA_SYNC(); // Perf-sync + + // Compute the thread's merge path segment + CoordinateT thread_current_coord = thread_start_coord; + KeyValuePairT scan_segment[ITEMS_PER_THREAD]; + ValueT running_total = 0.0; + + OffsetT row_end_offset = s_tile_row_end_offsets[thread_current_coord.x]; + ValueT nonzero = s_tile_nonzeros[thread_current_coord.y]; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + if (tile_nonzero_indices[thread_current_coord.y] < row_end_offset) + { + // Move down (accumulate) + scan_segment[ITEM].value = nonzero; + running_total += nonzero; + ++thread_current_coord.y; + nonzero = s_tile_nonzeros[thread_current_coord.y]; + } + else + { + // Move right (reset) + scan_segment[ITEM].value = 0.0; + running_total = 0.0; + ++thread_current_coord.x; + row_end_offset = s_tile_row_end_offsets[thread_current_coord.x]; + } + + scan_segment[ITEM].key = thread_current_coord.x; + } + + CTA_SYNC(); + + // Block-wide reduce-value-by-segment + KeyValuePairT tile_carry; + ReduceBySegmentOpT scan_op; + KeyValuePairT scan_item; + + scan_item.value = running_total; + scan_item.key = thread_current_coord.x; + + BlockScanT(temp_storage.aliasable.scan).ExclusiveScan(scan_item, scan_item, scan_op, tile_carry); + + if (threadIdx.x == 0) + { + scan_item.key = thread_start_coord.x; + scan_item.value = 0.0; + } + + if (tile_num_rows > 0) + { + + CTA_SYNC(); + + // Scan downsweep and scatter + ValueT* s_partials = &temp_storage.aliasable.merge_items[0].nonzero; + + if (scan_item.key != scan_segment[0].key) + { + s_partials[scan_item.key] = scan_item.value; + } + else + { + scan_segment[0].value += scan_item.value; + } + + #pragma unroll + for (int ITEM = 1; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + if 
(scan_segment[ITEM - 1].key != scan_segment[ITEM].key) + { + s_partials[scan_segment[ITEM - 1].key] = scan_segment[ITEM - 1].value; + } + else + { + scan_segment[ITEM].value += scan_segment[ITEM - 1].value; + } + } + + CTA_SYNC(); + + #pragma unroll 1 + for (int item = threadIdx.x; item < tile_num_rows; item += BLOCK_THREADS) + { + spmv_params.d_vector_y[tile_start_coord.x + item] = s_partials[item]; + } + } + + // Return the tile's running carry-out + return tile_carry; + } + + + /** + * Consume input tile + */ + __device__ __forceinline__ void ConsumeTile( + CoordinateT* d_tile_coordinates, ///< [in] Pointer to the temporary array of tile starting coordinates + KeyValuePairT* d_tile_carry_pairs, ///< [out] Pointer to the temporary array carry-out dot product row-ids, one per block + int num_merge_tiles) ///< [in] Number of merge tiles + { + int tile_idx = (blockIdx.x * gridDim.y) + blockIdx.y; // Current tile index + + if (tile_idx >= num_merge_tiles) + return; + + // Read our starting coordinates + if (threadIdx.x < 2) + { + if (d_tile_coordinates == NULL) + { + // Search our starting coordinates + OffsetT diagonal = (tile_idx + threadIdx.x) * TILE_ITEMS; + CoordinateT tile_coord; + CountingInputIterator nonzero_indices(0); + + // Search the merge path + MergePathSearch( + diagonal, + RowOffsetsSearchIteratorT(spmv_params.d_row_end_offsets), + nonzero_indices, + spmv_params.num_rows, + spmv_params.num_nonzeros, + tile_coord); + + temp_storage.tile_coords[threadIdx.x] = tile_coord; + } + else + { + temp_storage.tile_coords[threadIdx.x] = d_tile_coordinates[tile_idx + threadIdx.x]; + } + } + + CTA_SYNC(); + + CoordinateT tile_start_coord = temp_storage.tile_coords[0]; + CoordinateT tile_end_coord = temp_storage.tile_coords[1]; + + // Consume multi-segment tile + KeyValuePairT tile_carry = ConsumeTile( + tile_idx, + tile_start_coord, + tile_end_coord, + Int2Type()); + + // Output the tile's carry-out + if (threadIdx.x == 0) + { + if (HAS_ALPHA) + tile_carry.value *= spmv_params.alpha; + + tile_carry.key += tile_start_coord.x; + d_tile_carry_pairs[tile_idx] = tile_carry; + } + } + + +}; + + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/agent/single_pass_scan_operators.cuh b/GraphBLAS/CUDA/local_cub/agent/single_pass_scan_operators.cuh new file mode 100644 index 0000000000..53409bdeec --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/agent/single_pass_scan_operators.cuh @@ -0,0 +1,815 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Callback operator types for supplying BlockScan prefixes + */ + +#pragma once + +#include + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../warp/warp_reduce.cuh" +#include "../util_arch.cuh" +#include "../util_device.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Prefix functor type for maintaining a running prefix while scanning a + * region independent of other thread blocks + ******************************************************************************/ + +/** + * Stateful callback operator type for supplying BlockScan prefixes. + * Maintains a running prefix that can be applied to consecutive + * BlockScan operations. + */ +template < + typename T, ///< BlockScan value type + typename ScanOpT> ///< Wrapped scan operator type +struct BlockScanRunningPrefixOp +{ + ScanOpT op; ///< Wrapped scan operator + T running_total; ///< Running block-wide prefix + + /// Constructor + __device__ __forceinline__ BlockScanRunningPrefixOp(ScanOpT op) + : + op(op) + {} + + /// Constructor + __device__ __forceinline__ BlockScanRunningPrefixOp( + T starting_prefix, + ScanOpT op) + : + op(op), + running_total(starting_prefix) + {} + + /** + * Prefix callback operator. Returns the block-wide running_total in thread-0. + */ + __device__ __forceinline__ T operator()( + const T &block_aggregate) ///< The aggregate sum of the BlockScan inputs + { + T retval = running_total; + running_total = op(running_total, block_aggregate); + return retval; + } +}; + + +/****************************************************************************** + * Generic tile status interface types for block-cooperative scans + ******************************************************************************/ + +/** + * Enumerations of tile status + */ +enum ScanTileStatus +{ + SCAN_TILE_OOB, // Out-of-bounds (e.g., padding) + SCAN_TILE_INVALID = 99, // Not yet processed + SCAN_TILE_PARTIAL, // Tile aggregate is available + SCAN_TILE_INCLUSIVE, // Inclusive tile prefix is available +}; + + +/** + * Tile status interface. + */ +template < + typename T, + bool SINGLE_WORD = Traits::PRIMITIVE> +struct ScanTileState; + + +/** + * Tile status interface specialized for scan status and value types + * that can be combined into one machine word that can be + * read/written coherently in a single access. 
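The specialization that follows packs the status word and the scanned value into a single TxnWord, so a tile's descriptor can be written and read back in one coherent access. Below is a minimal host-side sketch of the same packing idea for a 4-byte status plus a 4-byte value; Descriptor, Pack, and Unpack are invented names, and the real code instead selects StatusWord/TxnWord with If<> metaprogramming and goes through ThreadStore/ThreadLoad with cache modifiers.

#include <cstdint>
#include <cstdio>
#include <cstring>

enum Status : std::int32_t { TILE_INVALID = 99, TILE_PARTIAL, TILE_INCLUSIVE };

struct Descriptor               // 4-byte status + 4-byte value -> one 8-byte word
{
    std::int32_t status;
    float        value;
};
static_assert(sizeof(Descriptor) == sizeof(std::uint64_t), "descriptor must fit in one word");

static std::uint64_t Pack(Descriptor d)
{
    std::uint64_t word;
    std::memcpy(&word, &d, sizeof(word));   // type-pun through memcpy
    return word;
}

static Descriptor Unpack(std::uint64_t word)
{
    Descriptor d;
    std::memcpy(&d, &word, sizeof(d));
    return d;
}

int main()
{
    std::uint64_t slot = Pack({TILE_INVALID, 0.0f});   // tile not yet processed
    slot = Pack({TILE_INCLUSIVE, 42.0f});              // status and value published in one store
    Descriptor d = Unpack(slot);
    std::printf("status=%d value=%g\n", (int) d.status, d.value);
    return 0;
}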
+ */ +template +struct ScanTileState +{ + // Status word type + typedef typename If<(sizeof(T) == 8), + long long, + typename If<(sizeof(T) == 4), + int, + typename If<(sizeof(T) == 2), + short, + char>::Type>::Type>::Type StatusWord; + + + // Unit word type + typedef typename If<(sizeof(T) == 8), + longlong2, + typename If<(sizeof(T) == 4), + int2, + typename If<(sizeof(T) == 2), + int, + uchar2>::Type>::Type>::Type TxnWord; + + + // Device word type + struct TileDescriptor + { + StatusWord status; + T value; + }; + + + // Constants + enum + { + TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS, + }; + + + // Device storage + TxnWord *d_tile_descriptors; + + /// Constructor + __host__ __device__ __forceinline__ + ScanTileState() + : + d_tile_descriptors(NULL) + {} + + + /// Initializer + __host__ __device__ __forceinline__ + cudaError_t Init( + int /*num_tiles*/, ///< [in] Number of tiles + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t /*temp_storage_bytes*/) ///< [in] Size in bytes of \t d_temp_storage allocation + { + d_tile_descriptors = reinterpret_cast(d_temp_storage); + return cudaSuccess; + } + + + /** + * Compute device memory needed for tile status + */ + __host__ __device__ __forceinline__ + static cudaError_t AllocationSize( + int num_tiles, ///< [in] Number of tiles + size_t &temp_storage_bytes) ///< [out] Size in bytes of \t d_temp_storage allocation + { + temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TileDescriptor); // bytes needed for tile status descriptors + return cudaSuccess; + } + + + /** + * Initialize (from device) + */ + __device__ __forceinline__ void InitializeStatus(int num_tiles) + { + int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; + + TxnWord val = TxnWord(); + TileDescriptor *descriptor = reinterpret_cast(&val); + + if (tile_idx < num_tiles) + { + // Not-yet-set + descriptor->status = StatusWord(SCAN_TILE_INVALID); + d_tile_descriptors[TILE_STATUS_PADDING + tile_idx] = val; + } + + if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING)) + { + // Padding + descriptor->status = StatusWord(SCAN_TILE_OOB); + d_tile_descriptors[threadIdx.x] = val; + } + } + + + /** + * Update the specified tile's inclusive value and corresponding status + */ + __device__ __forceinline__ void SetInclusive(int tile_idx, T tile_inclusive) + { + TileDescriptor tile_descriptor; + tile_descriptor.status = SCAN_TILE_INCLUSIVE; + tile_descriptor.value = tile_inclusive; + + TxnWord alias; + *reinterpret_cast(&alias) = tile_descriptor; + ThreadStore(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias); + } + + + /** + * Update the specified tile's partial value and corresponding status + */ + __device__ __forceinline__ void SetPartial(int tile_idx, T tile_partial) + { + TileDescriptor tile_descriptor; + tile_descriptor.status = SCAN_TILE_PARTIAL; + tile_descriptor.value = tile_partial; + + TxnWord alias; + *reinterpret_cast(&alias) = tile_descriptor; + ThreadStore(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias); + } + + /** + * Wait for the corresponding tile to become non-invalid + */ + __device__ __forceinline__ void WaitForValid( + int tile_idx, + StatusWord &status, + T &value) + { + TileDescriptor tile_descriptor; + do + { + __threadfence_block(); // prevent hoisting loads from loop + TxnWord alias = ThreadLoad(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx); + tile_descriptor = 
reinterpret_cast(alias); + + } while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff)); + + status = tile_descriptor.status; + value = tile_descriptor.value; + } + +}; + + + +/** + * Tile status interface specialized for scan status and value types that + * cannot be combined into one machine word. + */ +template +struct ScanTileState +{ + // Status word type + typedef char StatusWord; + + // Constants + enum + { + TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS, + }; + + // Device storage + StatusWord *d_tile_status; + T *d_tile_partial; + T *d_tile_inclusive; + + /// Constructor + __host__ __device__ __forceinline__ + ScanTileState() + : + d_tile_status(NULL), + d_tile_partial(NULL), + d_tile_inclusive(NULL) + {} + + + /// Initializer + __host__ __device__ __forceinline__ + cudaError_t Init( + int num_tiles, ///< [in] Number of tiles + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t temp_storage_bytes) ///< [in] Size in bytes of \t d_temp_storage allocation + { + cudaError_t error = cudaSuccess; + do + { + void* allocations[3]; + size_t allocation_sizes[3]; + + allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord); // bytes needed for tile status descriptors + allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // bytes needed for partials + allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // bytes needed for inclusives + + // Compute allocation pointers into the single storage blob + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + + // Alias the offsets + d_tile_status = reinterpret_cast(allocations[0]); + d_tile_partial = reinterpret_cast(allocations[1]); + d_tile_inclusive = reinterpret_cast(allocations[2]); + } + while (0); + + return error; + } + + + /** + * Compute device memory needed for tile status + */ + __host__ __device__ __forceinline__ + static cudaError_t AllocationSize( + int num_tiles, ///< [in] Number of tiles + size_t &temp_storage_bytes) ///< [out] Size in bytes of \t d_temp_storage allocation + { + // Specify storage allocation requirements + size_t allocation_sizes[3]; + allocation_sizes[0] = (num_tiles + TILE_STATUS_PADDING) * sizeof(StatusWord); // bytes needed for tile status descriptors + allocation_sizes[1] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // bytes needed for partials + allocation_sizes[2] = (num_tiles + TILE_STATUS_PADDING) * sizeof(Uninitialized); // bytes needed for inclusives + + // Set the necessary size of the blob + void* allocations[3]; + return CubDebug(AliasTemporaries(NULL, temp_storage_bytes, allocations, allocation_sizes)); + } + + + /** + * Initialize (from device) + */ + __device__ __forceinline__ void InitializeStatus(int num_tiles) + { + int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; + if (tile_idx < num_tiles) + { + // Not-yet-set + d_tile_status[TILE_STATUS_PADDING + tile_idx] = StatusWord(SCAN_TILE_INVALID); + } + + if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING)) + { + // Padding + d_tile_status[threadIdx.x] = StatusWord(SCAN_TILE_OOB); + } + } + + + /** + * Update the specified tile's inclusive value and corresponding status + */ + __device__ __forceinline__ void SetInclusive(int tile_idx, T tile_inclusive) + { + // Update tile inclusive value + ThreadStore(d_tile_inclusive + 
TILE_STATUS_PADDING + tile_idx, tile_inclusive); + + // Fence + __threadfence(); + + // Update tile status + ThreadStore(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_INCLUSIVE)); + } + + + /** + * Update the specified tile's partial value and corresponding status + */ + __device__ __forceinline__ void SetPartial(int tile_idx, T tile_partial) + { + // Update tile partial value + ThreadStore(d_tile_partial + TILE_STATUS_PADDING + tile_idx, tile_partial); + + // Fence + __threadfence(); + + // Update tile status + ThreadStore(d_tile_status + TILE_STATUS_PADDING + tile_idx, StatusWord(SCAN_TILE_PARTIAL)); + } + + /** + * Wait for the corresponding tile to become non-invalid + */ + __device__ __forceinline__ void WaitForValid( + int tile_idx, + StatusWord &status, + T &value) + { + do { + status = ThreadLoad(d_tile_status + TILE_STATUS_PADDING + tile_idx); + + __threadfence(); // prevent hoisting loads from loop or loads below above this one + + } while (status == SCAN_TILE_INVALID); + + if (status == StatusWord(SCAN_TILE_PARTIAL)) + value = ThreadLoad(d_tile_partial + TILE_STATUS_PADDING + tile_idx); + else + value = ThreadLoad(d_tile_inclusive + TILE_STATUS_PADDING + tile_idx); + } +}; + + +/****************************************************************************** + * ReduceByKey tile status interface types for block-cooperative scans + ******************************************************************************/ + +/** + * Tile status interface for reduction by key. + * + */ +template < + typename ValueT, + typename KeyT, + bool SINGLE_WORD = (Traits::PRIMITIVE) && (sizeof(ValueT) + sizeof(KeyT) < 16)> +struct ReduceByKeyScanTileState; + + +/** + * Tile status interface for reduction by key, specialized for scan status and value types that + * cannot be combined into one machine word. + */ +template < + typename ValueT, + typename KeyT> +struct ReduceByKeyScanTileState : + ScanTileState > +{ + typedef ScanTileState > SuperClass; + + /// Constructor + __host__ __device__ __forceinline__ + ReduceByKeyScanTileState() : SuperClass() {} +}; + + +/** + * Tile status interface for reduction by key, specialized for scan status and value types that + * can be combined into one machine word that can be read/written coherently in a single access. 
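Because the specialization above keeps the status and the value in separate arrays, SetPartial/SetInclusive write the value first, issue __threadfence(), and only then update the status word, while WaitForValid spins on the status before reading the value. A rough host-side analogue of that publish/consume pattern using C++ atomics is sketched here; Producer, Consumer, and the hard-coded status codes are invented for the sketch, and the GPU code relies on __threadfence() rather than std::atomic.

#include <atomic>
#include <cstdio>
#include <thread>

static double           tile_partial = 0.0;
static std::atomic<int> tile_status{99};          // 99 mirrors SCAN_TILE_INVALID

void Producer()
{
    tile_partial = 3.5;                               // write the payload first
    tile_status.store(1, std::memory_order_release);  // then publish the status
}

void Consumer()
{
    while (tile_status.load(std::memory_order_acquire) == 99)
        ;                                             // spin until the tile becomes valid
    std::printf("partial = %g\n", tile_partial);      // payload is now safe to read
}

int main()
{
    std::thread c(Consumer);
    std::thread p(Producer);
    p.join();
    c.join();
    return 0;
}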
+ */ +template < + typename ValueT, + typename KeyT> +struct ReduceByKeyScanTileState +{ + typedef KeyValuePairKeyValuePairT; + + // Constants + enum + { + PAIR_SIZE = sizeof(ValueT) + sizeof(KeyT), + TXN_WORD_SIZE = 1 << Log2::VALUE, + STATUS_WORD_SIZE = TXN_WORD_SIZE - PAIR_SIZE, + + TILE_STATUS_PADDING = CUB_PTX_WARP_THREADS, + }; + + // Status word type + typedef typename If<(STATUS_WORD_SIZE == 8), + long long, + typename If<(STATUS_WORD_SIZE == 4), + int, + typename If<(STATUS_WORD_SIZE == 2), + short, + char>::Type>::Type>::Type StatusWord; + + // Status word type + typedef typename If<(TXN_WORD_SIZE == 16), + longlong2, + typename If<(TXN_WORD_SIZE == 8), + long long, + int>::Type>::Type TxnWord; + + // Device word type (for when sizeof(ValueT) == sizeof(KeyT)) + struct TileDescriptorBigStatus + { + KeyT key; + ValueT value; + StatusWord status; + }; + + // Device word type (for when sizeof(ValueT) != sizeof(KeyT)) + struct TileDescriptorLittleStatus + { + ValueT value; + StatusWord status; + KeyT key; + }; + + // Device word type + typedef typename If< + (sizeof(ValueT) == sizeof(KeyT)), + TileDescriptorBigStatus, + TileDescriptorLittleStatus>::Type + TileDescriptor; + + + // Device storage + TxnWord *d_tile_descriptors; + + + /// Constructor + __host__ __device__ __forceinline__ + ReduceByKeyScanTileState() + : + d_tile_descriptors(NULL) + {} + + + /// Initializer + __host__ __device__ __forceinline__ + cudaError_t Init( + int /*num_tiles*/, ///< [in] Number of tiles + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t /*temp_storage_bytes*/) ///< [in] Size in bytes of \t d_temp_storage allocation + { + d_tile_descriptors = reinterpret_cast(d_temp_storage); + return cudaSuccess; + } + + + /** + * Compute device memory needed for tile status + */ + __host__ __device__ __forceinline__ + static cudaError_t AllocationSize( + int num_tiles, ///< [in] Number of tiles + size_t &temp_storage_bytes) ///< [out] Size in bytes of \t d_temp_storage allocation + { + temp_storage_bytes = (num_tiles + TILE_STATUS_PADDING) * sizeof(TileDescriptor); // bytes needed for tile status descriptors + return cudaSuccess; + } + + + /** + * Initialize (from device) + */ + __device__ __forceinline__ void InitializeStatus(int num_tiles) + { + int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; + TxnWord val = TxnWord(); + TileDescriptor *descriptor = reinterpret_cast(&val); + + if (tile_idx < num_tiles) + { + // Not-yet-set + descriptor->status = StatusWord(SCAN_TILE_INVALID); + d_tile_descriptors[TILE_STATUS_PADDING + tile_idx] = val; + } + + if ((blockIdx.x == 0) && (threadIdx.x < TILE_STATUS_PADDING)) + { + // Padding + descriptor->status = StatusWord(SCAN_TILE_OOB); + d_tile_descriptors[threadIdx.x] = val; + } + } + + + /** + * Update the specified tile's inclusive value and corresponding status + */ + __device__ __forceinline__ void SetInclusive(int tile_idx, KeyValuePairT tile_inclusive) + { + TileDescriptor tile_descriptor; + tile_descriptor.status = SCAN_TILE_INCLUSIVE; + tile_descriptor.value = tile_inclusive.value; + tile_descriptor.key = tile_inclusive.key; + + TxnWord alias; + *reinterpret_cast(&alias) = tile_descriptor; + ThreadStore(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias); + } + + + /** + * Update the specified tile's partial value and corresponding status + */ + __device__ __forceinline__ void SetPartial(int tile_idx, KeyValuePairT 
tile_partial) + { + TileDescriptor tile_descriptor; + tile_descriptor.status = SCAN_TILE_PARTIAL; + tile_descriptor.value = tile_partial.value; + tile_descriptor.key = tile_partial.key; + + TxnWord alias; + *reinterpret_cast(&alias) = tile_descriptor; + ThreadStore(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx, alias); + } + + /** + * Wait for the corresponding tile to become non-invalid + */ + __device__ __forceinline__ void WaitForValid( + int tile_idx, + StatusWord &status, + KeyValuePairT &value) + { +// TxnWord alias = ThreadLoad(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx); +// TileDescriptor tile_descriptor = reinterpret_cast(alias); +// +// while (tile_descriptor.status == SCAN_TILE_INVALID) +// { +// __threadfence_block(); // prevent hoisting loads from loop +// +// alias = ThreadLoad(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx); +// tile_descriptor = reinterpret_cast(alias); +// } +// +// status = tile_descriptor.status; +// value.value = tile_descriptor.value; +// value.key = tile_descriptor.key; + + TileDescriptor tile_descriptor; + do + { + __threadfence_block(); // prevent hoisting loads from loop + TxnWord alias = ThreadLoad(d_tile_descriptors + TILE_STATUS_PADDING + tile_idx); + tile_descriptor = reinterpret_cast(alias); + + } while (WARP_ANY((tile_descriptor.status == SCAN_TILE_INVALID), 0xffffffff)); + + status = tile_descriptor.status; + value.value = tile_descriptor.value; + value.key = tile_descriptor.key; + } + +}; + + +/****************************************************************************** + * Prefix call-back operator for coupling local block scan within a + * block-cooperative scan + ******************************************************************************/ + +/** + * Stateful block-scan prefix functor. Provides the the running prefix for + * the current tile by using the call-back warp to wait on on + * aggregates/prefixes from predecessor tiles to become available. 
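TilePrefixCallbackOp computes a tile's exclusive prefix by decoupled look-back: the tile posts its local aggregate as PARTIAL, then walks backwards over predecessor tiles, accumulating their partials, until it reaches a tile whose INCLUSIVE prefix has already been published. The sequential sketch below assumes every predecessor is already valid and inspects one predecessor at a time, whereas the real functor inspects a warp-wide window and waits on still-invalid tiles; ExclusivePrefix and Tile are invented names.

#include <vector>

enum TileStatus { PARTIAL, INCLUSIVE };

struct Tile
{
    TileStatus status;
    double     value;   // partial aggregate, or inclusive prefix if status == INCLUSIVE
};

double ExclusivePrefix(const std::vector<Tile>& tiles, int tile_idx)
{
    double exclusive = 0.0;
    for (int pred = tile_idx - 1; pred >= 0; --pred)
    {
        exclusive += tiles[pred].value;        // fold in this predecessor
        if (tiles[pred].status == INCLUSIVE)   // it already covers every earlier tile,
            break;                             // so the look-back can stop here
    }
    return exclusive;
}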
+ */ +template < + typename T, + typename ScanOpT, + typename ScanTileStateT, + int PTX_ARCH = CUB_PTX_ARCH> +struct TilePrefixCallbackOp +{ + // Parameterized warp reduce + typedef WarpReduce WarpReduceT; + + // Temporary storage type + struct _TempStorage + { + typename WarpReduceT::TempStorage warp_reduce; + T exclusive_prefix; + T inclusive_prefix; + T block_aggregate; + }; + + // Alias wrapper allowing temporary storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + // Type of status word + typedef typename ScanTileStateT::StatusWord StatusWord; + + // Fields + _TempStorage& temp_storage; ///< Reference to a warp-reduction instance + ScanTileStateT& tile_status; ///< Interface to tile status + ScanOpT scan_op; ///< Binary scan operator + int tile_idx; ///< The current tile index + T exclusive_prefix; ///< Exclusive prefix for the tile + T inclusive_prefix; ///< Inclusive prefix for the tile + + // Constructor + __device__ __forceinline__ + TilePrefixCallbackOp( + ScanTileStateT &tile_status, + TempStorage &temp_storage, + ScanOpT scan_op, + int tile_idx) + : + temp_storage(temp_storage.Alias()), + tile_status(tile_status), + scan_op(scan_op), + tile_idx(tile_idx) {} + + + // Block until all predecessors within the warp-wide window have non-invalid status + __device__ __forceinline__ + void ProcessWindow( + int predecessor_idx, ///< Preceding tile index to inspect + StatusWord &predecessor_status, ///< [out] Preceding tile status + T &window_aggregate) ///< [out] Relevant partial reduction from this window of preceding tiles + { + T value; + tile_status.WaitForValid(predecessor_idx, predecessor_status, value); + + // Perform a segmented reduction to get the prefix for the current window. + // Use the swizzled scan operator because we are now scanning *down* towards thread0. 
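ProcessWindow folds one warp-wide window of predecessors into window_aggregate with a tail-segmented reduction: lane 0 holds the nearest predecessor's value, higher lanes hold progressively older ones, and the reduction is meant to stop at the first lane whose tail flag marks an INCLUSIVE prefix, since everything older is already folded into that prefix. The sequential sketch below captures that intended behavior under the stated assumption and is not the CUB implementation; WindowAggregate is an invented name.

#include <cstddef>
#include <vector>

double WindowAggregate(
    const std::vector<double>& value,      // value[lane]: lane 0 = nearest predecessor
    const std::vector<int>&    tail_flag)  // 1 if that predecessor's prefix is inclusive
{
    double aggregate = 0.0;
    for (std::size_t lane = 0; lane < value.size(); ++lane)
    {
        aggregate += value[lane];   // fold in this predecessor
        if (tail_flag[lane])        // inclusive prefix found: older lanes are already covered
            break;
    }
    return aggregate;
}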
+ + int tail_flag = (predecessor_status == StatusWord(SCAN_TILE_INCLUSIVE)); + window_aggregate = WarpReduceT(temp_storage.warp_reduce).TailSegmentedReduce( + value, + tail_flag, + SwizzleScanOp(scan_op)); + } + + + // BlockScan prefix callback functor (called by the first warp) + __device__ __forceinline__ + T operator()(T block_aggregate) + { + + // Update our status with our tile-aggregate + if (threadIdx.x == 0) + { + temp_storage.block_aggregate = block_aggregate; + tile_status.SetPartial(tile_idx, block_aggregate); + } + + int predecessor_idx = tile_idx - threadIdx.x - 1; + StatusWord predecessor_status; + T window_aggregate; + + // Wait for the warp-wide window of predecessor tiles to become valid + ProcessWindow(predecessor_idx, predecessor_status, window_aggregate); + + // The exclusive tile prefix starts out as the current window aggregate + exclusive_prefix = window_aggregate; + + // Keep sliding the window back until we come across a tile whose inclusive prefix is known + while (WARP_ALL((predecessor_status != StatusWord(SCAN_TILE_INCLUSIVE)), 0xffffffff)) + { + predecessor_idx -= CUB_PTX_WARP_THREADS; + + // Update exclusive tile prefix with the window prefix + ProcessWindow(predecessor_idx, predecessor_status, window_aggregate); + exclusive_prefix = scan_op(window_aggregate, exclusive_prefix); + } + + // Compute the inclusive tile prefix and update the status for this tile + if (threadIdx.x == 0) + { + inclusive_prefix = scan_op(exclusive_prefix, block_aggregate); + tile_status.SetInclusive(tile_idx, inclusive_prefix); + + temp_storage.exclusive_prefix = exclusive_prefix; + temp_storage.inclusive_prefix = inclusive_prefix; + } + + // Return exclusive_prefix + return exclusive_prefix; + } + + // Get the exclusive prefix stored in temporary storage + __device__ __forceinline__ + T GetExclusivePrefix() + { + return temp_storage.exclusive_prefix; + } + + // Get the inclusive prefix stored in temporary storage + __device__ __forceinline__ + T GetInclusivePrefix() + { + return temp_storage.inclusive_prefix; + } + + // Get the block aggregate stored in temporary storage + __device__ __forceinline__ + T GetBlockAggregate() + { + return temp_storage.block_aggregate; + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/block/block_adjacent_difference.cuh b/GraphBLAS/CUDA/local_cub/block/block_adjacent_difference.cuh new file mode 100644 index 0000000000..acef9f0568 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/block/block_adjacent_difference.cuh @@ -0,0 +1,596 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockDiscontinuity class provides [collective](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block. + */ + +#pragma once + +#include "../util_type.cuh" +#include "../util_ptx.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +template < + typename T, + int BLOCK_DIM_X, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockAdjacentDifference +{ +private: + + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + + /// Shared memory storage layout type (last element from each thread's input) + struct _TempStorage + { + T first_items[BLOCK_THREADS]; + T last_items[BLOCK_THREADS]; + }; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /// Specialization for when FlagOp has third index param + template ::HAS_PARAM> + struct ApplyOp + { + // Apply flag operator + static __device__ __forceinline__ T FlagT(FlagOp flag_op, const T &a, const T &b, int idx) + { + return flag_op(b, a, idx); + } + }; + + /// Specialization for when FlagOp does not have a third index param + template + struct ApplyOp + { + // Apply flag operator + static __device__ __forceinline__ T FlagT(FlagOp flag_op, const T &a, const T &b, int /*idx*/) + { + return flag_op(b, a); + } + }; + + /// Templated unrolling of item comparison (inductive case) + template + struct Iterate + { + // Head flags + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + static __device__ __forceinline__ void FlagHeads( + int linear_tid, + FlagT (&flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + preds[ITERATION] = input[ITERATION - 1]; + + flags[ITERATION] = ApplyOp::FlagT( + flag_op, + preds[ITERATION], + input[ITERATION], + (linear_tid * 
ITEMS_PER_THREAD) + ITERATION); + + Iterate::FlagHeads(linear_tid, flags, input, preds, flag_op); + } + + // Tail flags + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + static __device__ __forceinline__ void FlagTails( + int linear_tid, + FlagT (&flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + flags[ITERATION] = ApplyOp::FlagT( + flag_op, + input[ITERATION], + input[ITERATION + 1], + (linear_tid * ITEMS_PER_THREAD) + ITERATION + 1); + + Iterate::FlagTails(linear_tid, flags, input, flag_op); + } + + }; + + /// Templated unrolling of item comparison (termination case) + template + struct Iterate + { + // Head flags + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + static __device__ __forceinline__ void FlagHeads( + int /*linear_tid*/, + FlagT (&/*flags*/)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&/*input*/)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&/*preds*/)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items + FlagOp /*flag_op*/) ///< [in] Binary boolean flag predicate + {} + + // Tail flags + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + static __device__ __forceinline__ void FlagTails( + int /*linear_tid*/, + FlagT (&/*flags*/)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&/*input*/)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp /*flag_op*/) ///< [in] Binary boolean flag predicate + {} + }; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + unsigned int linear_tid; + + +public: + + /// \smemstorage{BlockDiscontinuity} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockAdjacentDifference() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. 
+ */ + __device__ __forceinline__ BlockAdjacentDifference( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Head flag operations + *********************************************************************/ + //@{ + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeads( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share last item + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + if (linear_tid == 0) + { + // Set flag for first thread-item (preds[0] is undefined) + head_flags[0] = 1; + } + else + { + preds[0] = temp_storage.last_items[linear_tid - 1]; + head_flags[0] = ApplyOp::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); + } + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + } + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeads( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items + FlagOp flag_op, ///< [in] Binary boolean flag predicate + T tile_predecessor_item) ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). + { + // Share last item + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + // Set flag for first thread-item + preds[0] = (linear_tid == 0) ? + tile_predecessor_item : // First thread + temp_storage.last_items[linear_tid - 1]; + + head_flags[0] = ApplyOp::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + } + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeads( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + T preds[ITEMS_PER_THREAD]; + FlagHeads(head_flags, input, preds, flag_op); + } + + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeads( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op, ///< [in] Binary boolean flag predicate + T tile_predecessor_item) ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). 
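FlagHeads marks each item that starts a new run: a thread's first item is compared against the previous thread's last item (shared through temp_storage.last_items), or against tile_predecessor_item for thread 0, and the remaining items are compared pairwise within the thread. Because the arrangement is blocked, an item's flattened position is linear_tid * ITEMS_PER_THREAD + ITEM, which is also the index handed to flag operators that accept one. The host-side reference below assumes an inequality flag operator over a flattened blocked layout; FlagHeadsReference is an invented name.

#include <cstddef>
#include <vector>

std::vector<int> FlagHeadsReference(
    const std::vector<int>& items,                  // BLOCK_THREADS * ITEMS_PER_THREAD items, blocked order
    bool                    has_tile_predecessor,
    int                     tile_predecessor_item)
{
    std::vector<int> head_flags(items.size());
    for (std::size_t i = 0; i < items.size(); ++i)
    {
        if (i == 0)
            head_flags[i] = has_tile_predecessor
                          ? (items[i] != tile_predecessor_item)  // compare against previous tile
                          : 1;                                   // no predecessor: always a head
        else
            head_flags[i] = (items[i] != items[i - 1]);          // discontinuity with predecessor
    }
    return head_flags;
}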
+ { + T preds[ITEMS_PER_THREAD]; + FlagHeads(head_flags, input, preds, flag_op, tile_predecessor_item); + } + + + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagTails( + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first item + temp_storage.first_items[linear_tid] = input[0]; + + CTA_SYNC(); + + // Set flag for last thread-item + tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? + 1 : // Last thread + ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + temp_storage.first_items[linear_tid + 1], + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagTails( + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op, ///< [in] Binary boolean flag predicate + T tile_successor_item) ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). + { + // Share first item + temp_storage.first_items[linear_tid] = input[0]; + + CTA_SYNC(); + + // Set flag for last thread-item + T successor_item = (linear_tid == BLOCK_THREADS - 1) ? + tile_successor_item : // Last thread + temp_storage.first_items[linear_tid + 1]; + + tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + successor_item, + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeadsAndTails( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first and last items + temp_storage.first_items[linear_tid] = input[0]; + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + T preds[ITEMS_PER_THREAD]; + + // Set flag for first thread-item + preds[0] = temp_storage.last_items[linear_tid - 1]; + if (linear_tid == 0) + { + head_flags[0] = 1; + } + else + { + head_flags[0] = ApplyOp::FlagT( + flag_op, + preds[0], + input[0], + linear_tid * ITEMS_PER_THREAD); + } + + + // Set flag for last thread-item + tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? 
+ 1 : // Last thread + ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + temp_storage.first_items[linear_tid + 1], + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeadsAndTails( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T tile_successor_item, ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first and last items + temp_storage.first_items[linear_tid] = input[0]; + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + T preds[ITEMS_PER_THREAD]; + + // Set flag for first thread-item + if (linear_tid == 0) + { + head_flags[0] = 1; + } + else + { + preds[0] = temp_storage.last_items[linear_tid - 1]; + head_flags[0] = ApplyOp::FlagT( + flag_op, + preds[0], + input[0], + linear_tid * ITEMS_PER_THREAD); + } + + // Set flag for last thread-item + T successor_item = (linear_tid == BLOCK_THREADS - 1) ? + tile_successor_item : // Last thread + temp_storage.first_items[linear_tid + 1]; + + tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + successor_item, + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeadsAndTails( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T tile_predecessor_item, ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first and last items + temp_storage.first_items[linear_tid] = input[0]; + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + T preds[ITEMS_PER_THREAD]; + + // Set flag for first thread-item + preds[0] = (linear_tid == 0) ? + tile_predecessor_item : // First thread + temp_storage.last_items[linear_tid - 1]; + + head_flags[0] = ApplyOp::FlagT( + flag_op, + preds[0], + input[0], + linear_tid * ITEMS_PER_THREAD); + + // Set flag for last thread-item + tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? 
+ 1 : // Last thread + ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + temp_storage.first_items[linear_tid + 1], + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeadsAndTails( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T tile_predecessor_item, ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T tile_successor_item, ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first and last items + temp_storage.first_items[linear_tid] = input[0]; + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + T preds[ITEMS_PER_THREAD]; + + // Set flag for first thread-item + preds[0] = (linear_tid == 0) ? + tile_predecessor_item : // First thread + temp_storage.last_items[linear_tid - 1]; + + head_flags[0] = ApplyOp::FlagT( + flag_op, + preds[0], + input[0], + linear_tid * ITEMS_PER_THREAD); + + // Set flag for last thread-item + T successor_item = (linear_tid == BLOCK_THREADS - 1) ? + tile_successor_item : // Last thread + temp_storage.first_items[linear_tid + 1]; + + tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + successor_item, + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/block/block_discontinuity.cuh b/GraphBLAS/CUDA/local_cub/block/block_discontinuity.cuh new file mode 100644 index 0000000000..503e3e0b04 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/block/block_discontinuity.cuh @@ -0,0 +1,1148 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockDiscontinuity class provides [collective](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block. + */ + +#pragma once + +#include "../util_type.cuh" +#include "../util_ptx.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief The BlockDiscontinuity class provides [collective](index.html#sec0) methods for flagging discontinuities within an ordered set of items partitioned across a CUDA thread block. ![](discont_logo.png) + * \ingroup BlockModule + * + * \tparam T The data type to be flagged. + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - A set of "head flags" (or "tail flags") is often used to indicate corresponding items + * that differ from their predecessors (or successors). For example, head flags are convenient + * for demarcating disjoint data segments as part of a segmented scan or reduction. + * - \blocked + * + * \par Performance Considerations + * - \granularity + * + * \par A Simple Example + * \blockcollective{BlockDiscontinuity} + * \par + * The code snippet below illustrates the head flagging of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int + * typedef cub::BlockDiscontinuity BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute head flags for discontinuities in the segment + * int head_flags[4]; + * BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }. + * The corresponding output \p head_flags in those threads will be + * { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. 
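For reference, the example snippet above has lost its include target and template arguments in this diff; a complete, compilable version of the same head-flagging example might look like the following sketch (the kernel name is illustrative, the load/store helpers come from CUB's block_load/block_store headers, and d_in / d_head_flags are assumed to hold one 512-item tile):

#include <cub/cub.cuh>   // cub::BlockDiscontinuity, cub::LoadDirectBlocked, cub::Inequality

// Flag the first item of each run of equal values in a 512-item tile
// (128 threads, 4 consecutive items per thread, blocked arrangement).
__global__ void FlagRunHeads(const int *d_in, int *d_head_flags)
{
    // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int
    typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;

    // Allocate shared memory for BlockDiscontinuity
    __shared__ typename BlockDiscontinuity::TempStorage temp_storage;

    // Obtain a segment of consecutive items that are blocked across threads
    int thread_data[4];
    cub::LoadDirectBlocked(threadIdx.x, d_in, thread_data);

    // Collectively compute head flags for discontinuities in the segment
    int head_flags[4];
    BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality());

    // Store the flags in the same blocked arrangement
    cub::StoreDirectBlocked(threadIdx.x, d_head_flags, head_flags);
}

Launched as FlagRunHeads<<<1, 128>>>(d_in, d_head_flags), the behavior follows the pattern described above: for thread_data of { [0,0,1,1], [1,1,1,1], [2,3,3,3], ... } the head flags come out as { [1,0,1,0], [0,0,0,0], [1,1,0,0], ... }.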
+ * + * \par Performance Considerations + * - Incurs zero bank conflicts for most types + * + */ +template < + typename T, + int BLOCK_DIM_X, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockDiscontinuity +{ +private: + + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + + /// Shared memory storage layout type (last element from each thread's input) + struct _TempStorage + { + T first_items[BLOCK_THREADS]; + T last_items[BLOCK_THREADS]; + }; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /// Specialization for when FlagOp has third index param + template ::HAS_PARAM> + struct ApplyOp + { + // Apply flag operator + static __device__ __forceinline__ bool FlagT(FlagOp flag_op, const T &a, const T &b, int idx) + { + return flag_op(a, b, idx); + } + }; + + /// Specialization for when FlagOp does not have a third index param + template + struct ApplyOp + { + // Apply flag operator + static __device__ __forceinline__ bool FlagT(FlagOp flag_op, const T &a, const T &b, int /*idx*/) + { + return flag_op(a, b); + } + }; + + /// Templated unrolling of item comparison (inductive case) + template + struct Iterate + { + // Head flags + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + static __device__ __forceinline__ void FlagHeads( + int linear_tid, + FlagT (&flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + preds[ITERATION] = input[ITERATION - 1]; + + flags[ITERATION] = ApplyOp::FlagT( + flag_op, + preds[ITERATION], + input[ITERATION], + (linear_tid * ITEMS_PER_THREAD) + ITERATION); + + Iterate::FlagHeads(linear_tid, flags, input, preds, flag_op); + } + + // Tail flags + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + static __device__ __forceinline__ void FlagTails( + int linear_tid, + FlagT (&flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + flags[ITERATION] = ApplyOp::FlagT( + flag_op, + input[ITERATION], + input[ITERATION + 1], + (linear_tid * ITEMS_PER_THREAD) + ITERATION + 1); + + Iterate::FlagTails(linear_tid, flags, input, flag_op); + } + + }; + + /// Templated unrolling of item comparison (termination case) + template + struct Iterate + { + // Head flags + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + static __device__ __forceinline__ void FlagHeads( + int /*linear_tid*/, + FlagT (&/*flags*/)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&/*input*/)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&/*preds*/)[ITEMS_PER_THREAD], ///< [out] 
Calling thread's predecessor items + FlagOp /*flag_op*/) ///< [in] Binary boolean flag predicate + {} + + // Tail flags + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + static __device__ __forceinline__ void FlagTails( + int /*linear_tid*/, + FlagT (&/*flags*/)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&/*input*/)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp /*flag_op*/) ///< [in] Binary boolean flag predicate + {} + }; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + unsigned int linear_tid; + + +public: + + /// \smemstorage{BlockDiscontinuity} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockDiscontinuity() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. + */ + __device__ __forceinline__ BlockDiscontinuity( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Head flag operations + *********************************************************************/ + //@{ + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeads( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share last item + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + if (linear_tid == 0) + { + // Set flag for first thread-item (preds[0] is undefined) + head_flags[0] = 1; + } + else + { + preds[0] = temp_storage.last_items[linear_tid - 1]; + head_flags[0] = ApplyOp::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); + } + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + } + + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeads( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&preds)[ITEMS_PER_THREAD], ///< [out] Calling thread's predecessor items + FlagOp flag_op, ///< [in] Binary boolean flag predicate + T tile_predecessor_item) ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). 
+ { + // Share last item + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + // Set flag for first thread-item + preds[0] = (linear_tid == 0) ? + tile_predecessor_item : // First thread + temp_storage.last_items[linear_tid - 1]; + + head_flags[0] = ApplyOp::FlagT(flag_op, preds[0], input[0], linear_tid * ITEMS_PER_THREAD); + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + } + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + /** + * \brief Sets head flags indicating discontinuities between items partitioned across the thread block, for which the first item has no reference and is always flagged. + * + * \par + * - The flag head_flagsi is set for item + * inputi when + * flag_op(previous-item, inputi) + * returns \p true (where previous-item is either the preceding item + * in the same thread or the last item in the previous thread). + * - For thread0, item input0 is always flagged. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the head-flagging of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int + * typedef cub::BlockDiscontinuity BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute head flags for discontinuities in the segment + * int head_flags[4]; + * BlockDiscontinuity(temp_storage).FlagHeads(head_flags, thread_data, cub::Inequality()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }. + * The corresponding output \p head_flags in those threads will be + * { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam FlagT [inferred] The flag type (must be an integer type) + * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + */ + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeads( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + T preds[ITEMS_PER_THREAD]; + FlagHeads(head_flags, input, preds, flag_op); + } + + + /** + * \brief Sets head flags indicating discontinuities between items partitioned across the thread block. + * + * \par + * - The flag head_flagsi is set for item + * inputi when + * flag_op(previous-item, inputi) + * returns \p true (where previous-item is either the preceding item + * in the same thread or the last item in the previous thread). 
+ * - For thread0, item input0 is compared + * against \p tile_predecessor_item. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the head-flagging of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int + * typedef cub::BlockDiscontinuity BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Have thread0 obtain the predecessor item for the entire tile + * int tile_predecessor_item; + * if (threadIdx.x == 0) tile_predecessor_item == ... + * + * // Collectively compute head flags for discontinuities in the segment + * int head_flags[4]; + * BlockDiscontinuity(temp_storage).FlagHeads( + * head_flags, thread_data, cub::Inequality(), tile_predecessor_item); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,0,1,1], [1,1,1,1], [2,3,3,3], [3,4,4,4], ... }, + * and that \p tile_predecessor_item is \p 0. The corresponding output \p head_flags in those threads will be + * { [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam FlagT [inferred] The flag type (must be an integer type) + * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + */ + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeads( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op, ///< [in] Binary boolean flag predicate + T tile_predecessor_item) ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). + { + T preds[ITEMS_PER_THREAD]; + FlagHeads(head_flags, input, preds, flag_op, tile_predecessor_item); + } + + + + //@} end member group + /******************************************************************//** + * \name Tail flag operations + *********************************************************************/ + //@{ + + + /** + * \brief Sets tail flags indicating discontinuities between items partitioned across the thread block, for which the last item has no reference and is always flagged. + * + * \par + * - The flag tail_flagsi is set for item + * inputi when + * flag_op(inputi, next-item) + * returns \p true (where next-item is either the next item + * in the same thread or the first item in the next thread). + * - For threadBLOCK_THREADS-1, item + * inputITEMS_PER_THREAD-1 is always flagged. 
+ * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the tail-flagging of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int + * typedef cub::BlockDiscontinuity BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute tail flags for discontinuities in the segment + * int tail_flags[4]; + * BlockDiscontinuity(temp_storage).FlagTails(tail_flags, thread_data, cub::Inequality()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }. + * The corresponding output \p tail_flags in those threads will be + * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam FlagT [inferred] The flag type (must be an integer type) + * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + */ + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagTails( + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first item + temp_storage.first_items[linear_tid] = input[0]; + + CTA_SYNC(); + + // Set flag for last thread-item + tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? + 1 : // Last thread + ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + temp_storage.first_items[linear_tid + 1], + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + /** + * \brief Sets tail flags indicating discontinuities between items partitioned across the thread block. + * + * \par + * - The flag tail_flagsi is set for item + * inputi when + * flag_op(inputi, next-item) + * returns \p true (where next-item is either the next item + * in the same thread or the first item in the next thread). + * - For threadBLOCK_THREADS-1, item + * inputITEMS_PER_THREAD-1 is compared + * against \p tile_successor_item. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the tail-flagging of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int + * typedef cub::BlockDiscontinuity BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Have thread127 obtain the successor item for the entire tile + * int tile_successor_item; + * if (threadIdx.x == 127) tile_successor_item == ... + * + * // Collectively compute tail flags for discontinuities in the segment + * int tail_flags[4]; + * BlockDiscontinuity(temp_storage).FlagTails( + * tail_flags, thread_data, cub::Inequality(), tile_successor_item); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] } + * and that \p tile_successor_item is \p 125. The corresponding output \p tail_flags in those threads will be + * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam FlagT [inferred] The flag type (must be an integer type) + * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + */ + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagTails( + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op, ///< [in] Binary boolean flag predicate + T tile_successor_item) ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). + { + // Share first item + temp_storage.first_items[linear_tid] = input[0]; + + CTA_SYNC(); + + // Set flag for last thread-item + T successor_item = (linear_tid == BLOCK_THREADS - 1) ? + tile_successor_item : // Last thread + temp_storage.first_items[linear_tid + 1]; + + tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + successor_item, + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + //@} end member group + /******************************************************************//** + * \name Head & tail flag operations + *********************************************************************/ + //@{ + + + /** + * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block. + * + * \par + * - The flag head_flagsi is set for item + * inputi when + * flag_op(previous-item, inputi) + * returns \p true (where previous-item is either the preceding item + * in the same thread or the last item in the previous thread). + * - For thread0, item input0 is always flagged. + * - The flag tail_flagsi is set for item + * inputi when + * flag_op(inputi, next-item) + * returns \p true (where next-item is either the next item + * in the same thread or the first item in the next thread). 
+ * - For threadBLOCK_THREADS-1, item + * inputITEMS_PER_THREAD-1 is always flagged. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the head- and tail-flagging of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int + * typedef cub::BlockDiscontinuity BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute head and flags for discontinuities in the segment + * int head_flags[4]; + * int tail_flags[4]; + * BlockDiscontinuity(temp_storage).FlagTails( + * head_flags, tail_flags, thread_data, cub::Inequality()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] } + * and that the tile_successor_item is \p 125. The corresponding output \p head_flags + * in those threads will be { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. + * and the corresponding output \p tail_flags in those threads will be + * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam FlagT [inferred] The flag type (must be an integer type) + * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + */ + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeadsAndTails( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first and last items + temp_storage.first_items[linear_tid] = input[0]; + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + T preds[ITEMS_PER_THREAD]; + + // Set flag for first thread-item + preds[0] = temp_storage.last_items[linear_tid - 1]; + if (linear_tid == 0) + { + head_flags[0] = 1; + } + else + { + head_flags[0] = ApplyOp::FlagT( + flag_op, + preds[0], + input[0], + linear_tid * ITEMS_PER_THREAD); + } + + + // Set flag for last thread-item + tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? 
+ 1 : // Last thread + ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + temp_storage.first_items[linear_tid + 1], + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + /** + * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block. + * + * \par + * - The flag head_flagsi is set for item + * inputi when + * flag_op(previous-item, inputi) + * returns \p true (where previous-item is either the preceding item + * in the same thread or the last item in the previous thread). + * - For thread0, item input0 is always flagged. + * - The flag tail_flagsi is set for item + * inputi when + * flag_op(inputi, next-item) + * returns \p true (where next-item is either the next item + * in the same thread or the first item in the next thread). + * - For threadBLOCK_THREADS-1, item + * inputITEMS_PER_THREAD-1 is compared + * against \p tile_predecessor_item. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the head- and tail-flagging of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int + * typedef cub::BlockDiscontinuity BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Have thread127 obtain the successor item for the entire tile + * int tile_successor_item; + * if (threadIdx.x == 127) tile_successor_item == ... + * + * // Collectively compute head and flags for discontinuities in the segment + * int head_flags[4]; + * int tail_flags[4]; + * BlockDiscontinuity(temp_storage).FlagTails( + * head_flags, tail_flags, tile_successor_item, thread_data, cub::Inequality()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] } + * and that the tile_successor_item is \p 125. The corresponding output \p head_flags + * in those threads will be { [1,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. + * and the corresponding output \p tail_flags in those threads will be + * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam FlagT [inferred] The flag type (must be an integer type) + * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. 
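Note that the snippet above invokes FlagTails even though the member being documented sets both head and tail flags; a call matching the FlagHeadsAndTails declaration that follows would look like this sketch (the kernel name and constants are illustrative; 128 threads with 4 items per thread are assumed):

#include <cub/cub.cuh>

__global__ void FlagHeadsAndTailsKernel(const int *d_in, int *d_heads, int *d_tails)
{
    typedef cub::BlockDiscontinuity<int, 128> BlockDiscontinuity;
    __shared__ typename BlockDiscontinuity::TempStorage temp_storage;

    // Blocked tile of 4 consecutive items per thread
    int thread_data[4];
    cub::LoadDirectBlocked(threadIdx.x, d_in, thread_data);

    // Only the last thread's value is consulted for the tile successor
    int tile_successor_item = 0;
    if (threadIdx.x == 127) tile_successor_item = 125;   // illustrative value

    int head_flags[4];
    int tail_flags[4];
    BlockDiscontinuity(temp_storage).FlagHeadsAndTails(
        head_flags, tail_flags, tile_successor_item, thread_data, cub::Inequality());

    cub::StoreDirectBlocked(threadIdx.x, d_heads, head_flags);
    cub::StoreDirectBlocked(threadIdx.x, d_tails, tail_flags);
}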
+ */ + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeadsAndTails( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T tile_successor_item, ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first and last items + temp_storage.first_items[linear_tid] = input[0]; + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + T preds[ITEMS_PER_THREAD]; + + // Set flag for first thread-item + if (linear_tid == 0) + { + head_flags[0] = 1; + } + else + { + preds[0] = temp_storage.last_items[linear_tid - 1]; + head_flags[0] = ApplyOp::FlagT( + flag_op, + preds[0], + input[0], + linear_tid * ITEMS_PER_THREAD); + } + + // Set flag for last thread-item + T successor_item = (linear_tid == BLOCK_THREADS - 1) ? + tile_successor_item : // Last thread + temp_storage.first_items[linear_tid + 1]; + + tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + successor_item, + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + /** + * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block. + * + * \par + * - The flag head_flagsi is set for item + * inputi when + * flag_op(previous-item, inputi) + * returns \p true (where previous-item is either the preceding item + * in the same thread or the last item in the previous thread). + * - For thread0, item input0 is compared + * against \p tile_predecessor_item. + * - The flag tail_flagsi is set for item + * inputi when + * flag_op(inputi, next-item) + * returns \p true (where next-item is either the next item + * in the same thread or the first item in the next thread). + * - For threadBLOCK_THREADS-1, item + * inputITEMS_PER_THREAD-1 is always flagged. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the head- and tail-flagging of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int + * typedef cub::BlockDiscontinuity BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Have thread0 obtain the predecessor item for the entire tile + * int tile_predecessor_item; + * if (threadIdx.x == 0) tile_predecessor_item == ... + * + * // Have thread127 obtain the successor item for the entire tile + * int tile_successor_item; + * if (threadIdx.x == 127) tile_successor_item == ... 
+ * + * // Collectively compute head and flags for discontinuities in the segment + * int head_flags[4]; + * int tail_flags[4]; + * BlockDiscontinuity(temp_storage).FlagTails( + * head_flags, tile_predecessor_item, tail_flags, tile_successor_item, + * thread_data, cub::Inequality()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }, + * that the \p tile_predecessor_item is \p 0, and that the + * \p tile_successor_item is \p 125. The corresponding output \p head_flags + * in those threads will be { [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. + * and the corresponding output \p tail_flags in those threads will be + * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,1] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam FlagT [inferred] The flag type (must be an integer type) + * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + */ + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeadsAndTails( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T tile_predecessor_item, ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first and last items + temp_storage.first_items[linear_tid] = input[0]; + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + T preds[ITEMS_PER_THREAD]; + + // Set flag for first thread-item + preds[0] = (linear_tid == 0) ? + tile_predecessor_item : // First thread + temp_storage.last_items[linear_tid - 1]; + + head_flags[0] = ApplyOp::FlagT( + flag_op, + preds[0], + input[0], + linear_tid * ITEMS_PER_THREAD); + + // Set flag for last thread-item + tail_flags[ITEMS_PER_THREAD - 1] = (linear_tid == BLOCK_THREADS - 1) ? + 1 : // Last thread + ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + temp_storage.first_items[linear_tid + 1], + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + /** + * \brief Sets both head and tail flags indicating discontinuities between items partitioned across the thread block. + * + * \par + * - The flag head_flagsi is set for item + * inputi when + * flag_op(previous-item, inputi) + * returns \p true (where previous-item is either the preceding item + * in the same thread or the last item in the previous thread). + * - For thread0, item input0 is compared + * against \p tile_predecessor_item. 
+ * - The flag tail_flagsi is set for item + * inputi when + * flag_op(inputi, next-item) + * returns \p true (where next-item is either the next item + * in the same thread or the first item in the next thread). + * - For threadBLOCK_THREADS-1, item + * inputITEMS_PER_THREAD-1 is compared + * against \p tile_successor_item. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the head- and tail-flagging of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockDiscontinuity for a 1D block of 128 threads on type int + * typedef cub::BlockDiscontinuity BlockDiscontinuity; + * + * // Allocate shared memory for BlockDiscontinuity + * __shared__ typename BlockDiscontinuity::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Have thread0 obtain the predecessor item for the entire tile + * int tile_predecessor_item; + * if (threadIdx.x == 0) tile_predecessor_item == ... + * + * // Have thread127 obtain the successor item for the entire tile + * int tile_successor_item; + * if (threadIdx.x == 127) tile_successor_item == ... + * + * // Collectively compute head and flags for discontinuities in the segment + * int head_flags[4]; + * int tail_flags[4]; + * BlockDiscontinuity(temp_storage).FlagTails( + * head_flags, tile_predecessor_item, tail_flags, tile_successor_item, + * thread_data, cub::Inequality()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,0,1,1], [1,1,1,1], [2,3,3,3], ..., [124,125,125,125] }, + * that the \p tile_predecessor_item is \p 0, and that the + * \p tile_successor_item is \p 125. The corresponding output \p head_flags + * in those threads will be { [0,0,1,0], [0,0,0,0], [1,1,0,0], [0,1,0,0], ... }. + * and the corresponding output \p tail_flags in those threads will be + * { [0,1,0,0], [0,0,0,1], [1,0,0,...], ..., [1,0,0,0] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam FlagT [inferred] The flag type (must be an integer type) + * \tparam FlagOp [inferred] Binary predicate functor type having member T operator()(const T &a, const T &b) or member T operator()(const T &a, const T &b, unsigned int b_index), and returning \p true if a discontinuity exists between \p a and \p b, otherwise \p false. \p b_index is the rank of b in the aggregate tile of data. + */ + template < + int ITEMS_PER_THREAD, + typename FlagT, + typename FlagOp> + __device__ __forceinline__ void FlagHeadsAndTails( + FlagT (&head_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity head_flags + T tile_predecessor_item, ///< [in] [thread0 only] Item with which to compare the first tile item (input0 from thread0). + FlagT (&tail_flags)[ITEMS_PER_THREAD], ///< [out] Calling thread's discontinuity tail_flags + T tile_successor_item, ///< [in] [threadBLOCK_THREADS-1 only] Item with which to compare the last tile item (inputITEMS_PER_THREAD-1 from threadBLOCK_THREADS-1). 
+ T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + FlagOp flag_op) ///< [in] Binary boolean flag predicate + { + // Share first and last items + temp_storage.first_items[linear_tid] = input[0]; + temp_storage.last_items[linear_tid] = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + T preds[ITEMS_PER_THREAD]; + + // Set flag for first thread-item + preds[0] = (linear_tid == 0) ? + tile_predecessor_item : // First thread + temp_storage.last_items[linear_tid - 1]; + + head_flags[0] = ApplyOp::FlagT( + flag_op, + preds[0], + input[0], + linear_tid * ITEMS_PER_THREAD); + + // Set flag for last thread-item + T successor_item = (linear_tid == BLOCK_THREADS - 1) ? + tile_successor_item : // Last thread + temp_storage.first_items[linear_tid + 1]; + + tail_flags[ITEMS_PER_THREAD - 1] = ApplyOp::FlagT( + flag_op, + input[ITEMS_PER_THREAD - 1], + successor_item, + (linear_tid * ITEMS_PER_THREAD) + ITEMS_PER_THREAD); + + // Set head_flags for remaining items + Iterate<1, ITEMS_PER_THREAD>::FlagHeads(linear_tid, head_flags, input, preds, flag_op); + + // Set tail_flags for remaining items + Iterate<0, ITEMS_PER_THREAD - 1>::FlagTails(linear_tid, tail_flags, input, flag_op); + } + + + + + //@} end member group + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/block/block_exchange.cuh b/GraphBLAS/CUDA/local_cub/block/block_exchange.cuh new file mode 100644 index 0000000000..3ae9934391 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/block/block_exchange.cuh @@ -0,0 +1,1248 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockExchange class provides [collective](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block. 
+ */ + +#pragma once + +#include "../util_ptx.cuh" +#include "../util_arch.cuh" +#include "../util_macro.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief The BlockExchange class provides [collective](index.html#sec0) methods for rearranging data partitioned across a CUDA thread block. ![](transpose_logo.png) + * \ingroup BlockModule + * + * \tparam T The data type to be exchanged. + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam ITEMS_PER_THREAD The number of items partitioned onto each thread. + * \tparam WARP_TIME_SLICING [optional] When \p true, only use enough shared memory for a single warp's worth of tile data, time-slicing the block-wide exchange over multiple synchronized rounds. Yields a smaller memory footprint at the expense of decreased parallelism. (Default: false) + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - It is commonplace for blocks of threads to rearrange data items between + * threads. For example, the device-accessible memory subsystem prefers access patterns + * where data items are "striped" across threads (where consecutive threads access consecutive items), + * yet most block-wide operations prefer a "blocked" partitioning of items across threads + * (where consecutive items belong to a single thread). + * - BlockExchange supports the following types of data exchanges: + * - Transposing between [blocked](index.html#sec5sec3) and [striped](index.html#sec5sec3) arrangements + * - Transposing between [blocked](index.html#sec5sec3) and [warp-striped](index.html#sec5sec3) arrangements + * - Scattering ranked items to a [blocked arrangement](index.html#sec5sec3) + * - Scattering ranked items to a [striped arrangement](index.html#sec5sec3) + * - \rowmajor + * + * \par A Simple Example + * \blockcollective{BlockExchange} + * \par + * The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement + * of 512 integer items partitioned across 128 threads where each thread owns 4 items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, ...) + * { + * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockExchange BlockExchange; + * + * // Allocate shared memory for BlockExchange + * __shared__ typename BlockExchange::TempStorage temp_storage; + * + * // Load a tile of data striped across threads + * int thread_data[4]; + * cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data); + * + * // Collectively exchange data into a blocked arrangement across threads + * BlockExchange(temp_storage).StripedToBlocked(thread_data); + * + * \endcode + * \par + * Suppose the set of striped input \p thread_data across the block of threads is + * { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] }. + * The corresponding output \p thread_data in those threads will be + * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. + * + * \par Performance Considerations + * - Proper device-specific padding ensures zero bank conflicts for most types. 
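The class-level example above has likewise lost its template arguments in this diff; a complete version of the same striped-to-blocked conversion might read as follows (a sketch: the kernel name and the final blocked store are illustrative, and d_data / d_out are assumed to hold one 512-item tile):

#include <cub/cub.cuh>   // cub::BlockExchange, cub::LoadDirectStriped

__global__ void StripedToBlockedKernel(const int *d_data, int *d_out)
{
    // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each
    typedef cub::BlockExchange<int, 128, 4> BlockExchange;

    // Allocate shared memory for BlockExchange
    __shared__ typename BlockExchange::TempStorage temp_storage;

    // Load a tile of data striped across threads
    int thread_data[4];
    cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data);

    // Collectively exchange data into a blocked arrangement across threads
    BlockExchange(temp_storage).StripedToBlocked(thread_data);

    // Each thread now owns 4 consecutive items of the tile
    cub::StoreDirectBlocked(threadIdx.x, d_out, thread_data);
}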
+ * + */ +template < + typename InputT, + int BLOCK_DIM_X, + int ITEMS_PER_THREAD, + bool WARP_TIME_SLICING = false, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockExchange +{ +private: + + /****************************************************************************** + * Constants + ******************************************************************************/ + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH), + WARP_THREADS = 1 << LOG_WARP_THREADS, + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + + LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(PTX_ARCH), + SMEM_BANKS = 1 << LOG_SMEM_BANKS, + + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + + TIME_SLICES = (WARP_TIME_SLICING) ? WARPS : 1, + + TIME_SLICED_THREADS = (WARP_TIME_SLICING) ? CUB_MIN(BLOCK_THREADS, WARP_THREADS) : BLOCK_THREADS, + TIME_SLICED_ITEMS = TIME_SLICED_THREADS * ITEMS_PER_THREAD, + + WARP_TIME_SLICED_THREADS = CUB_MIN(BLOCK_THREADS, WARP_THREADS), + WARP_TIME_SLICED_ITEMS = WARP_TIME_SLICED_THREADS * ITEMS_PER_THREAD, + + // Insert padding to avoid bank conflicts during raking when items per thread is a power of two and > 4 (otherwise we can typically use 128b loads) + INSERT_PADDING = (ITEMS_PER_THREAD > 4) && (PowerOfTwo::VALUE), + PADDING_ITEMS = (INSERT_PADDING) ? (TIME_SLICED_ITEMS >> LOG_SMEM_BANKS) : 0, + }; + + /****************************************************************************** + * Type definitions + ******************************************************************************/ + + /// Shared memory storage layout type + struct __align__(16) _TempStorage + { + InputT buff[TIME_SLICED_ITEMS + PADDING_ITEMS]; + }; + +public: + + /// \smemstorage{BlockExchange} + struct TempStorage : Uninitialized<_TempStorage> {}; + +private: + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + unsigned int linear_tid; + unsigned int lane_id; + unsigned int warp_id; + unsigned int warp_offset; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /** + * Transposes data items from blocked arrangement to striped arrangement. Specialized for no timeslicing. + */ + template + __device__ __forceinline__ void BlockedToStriped( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. 
+ Int2Type /*time_slicing*/) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + + + /** + * Transposes data items from blocked arrangement to striped arrangement. Specialized for warp-timeslicing. + */ + template + __device__ __forceinline__ void BlockedToStriped( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. + Int2Type /*time_slicing*/) + { + InputT temp_items[ITEMS_PER_THREAD]; + + #pragma unroll + for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) + { + const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS; + const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS; + + CTA_SYNC(); + + if (warp_id == SLICE) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage.buff[item_offset] = input_items[ITEM]; + } + } + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + // Read a strip of items + const int STRIP_OFFSET = ITEM * BLOCK_THREADS; + const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS; + + if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET)) + { + int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET; + if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS)) + { + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_items[ITEM] = temp_storage.buff[item_offset]; + } + } + } + } + + // Copy + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + output_items[ITEM] = temp_items[ITEM]; + } + } + + + /** + * Transposes data items from blocked arrangement to warp-striped arrangement. Specialized for no timeslicing + */ + template + __device__ __forceinline__ void BlockedToWarpStriped( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. + Int2Type /*time_slicing*/) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD); + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + WARP_SYNC(0xffffffff); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + + /** + * Transposes data items from blocked arrangement to warp-striped arrangement. 
Specialized for warp-timeslicing + */ + template + __device__ __forceinline__ void BlockedToWarpStriped( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. + Int2Type /*time_slicing*/) + { + if (warp_id == 0) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD); + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + WARP_SYNC(0xffffffff); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + + #pragma unroll + for (unsigned int SLICE = 1; SLICE < TIME_SLICES; ++SLICE) + { + CTA_SYNC(); + + if (warp_id == SLICE) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD); + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + WARP_SYNC(0xffffffff); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + } + } + + + /** + * Transposes data items from striped arrangement to blocked arrangement. Specialized for no timeslicing. + */ + template + __device__ __forceinline__ void StripedToBlocked( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. + Int2Type /*time_slicing*/) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + CTA_SYNC(); + + // No timeslicing + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + + + /** + * Transposes data items from striped arrangement to blocked arrangement. Specialized for warp-timeslicing. + */ + template + __device__ __forceinline__ void StripedToBlocked( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. 
+ Int2Type /*time_slicing*/) + { + // Warp time-slicing + InputT temp_items[ITEMS_PER_THREAD]; + + #pragma unroll + for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) + { + const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS; + const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS; + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + // Write a strip of items + const int STRIP_OFFSET = ITEM * BLOCK_THREADS; + const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS; + + if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET)) + { + int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET; + if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS)) + { + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage.buff[item_offset] = input_items[ITEM]; + } + } + } + + CTA_SYNC(); + + if (warp_id == SLICE) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_items[ITEM] = temp_storage.buff[item_offset]; + } + } + } + + // Copy + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + output_items[ITEM] = temp_items[ITEM]; + } + } + + + /** + * Transposes data items from warp-striped arrangement to blocked arrangement. Specialized for no timeslicing + */ + template + __device__ __forceinline__ void WarpStripedToBlocked( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. + Int2Type /*time_slicing*/) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = warp_offset + (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + WARP_SYNC(0xffffffff); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = warp_offset + ITEM + (lane_id * ITEMS_PER_THREAD); + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + + + /** + * Transposes data items from warp-striped arrangement to blocked arrangement. Specialized for warp-timeslicing + */ + template + __device__ __forceinline__ void WarpStripedToBlocked( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. 
+ Int2Type /*time_slicing*/) + { + #pragma unroll + for (unsigned int SLICE = 0; SLICE < TIME_SLICES; ++SLICE) + { + CTA_SYNC(); + + if (warp_id == SLICE) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (ITEM * WARP_TIME_SLICED_THREADS) + lane_id; + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + WARP_SYNC(0xffffffff); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ITEM + (lane_id * ITEMS_PER_THREAD); + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + } + } + + + /** + * Exchanges data items annotated by rank into blocked arrangement. Specialized for no timeslicing. + */ + template + __device__ __forceinline__ void ScatterToBlocked( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. + OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks + Int2Type /*time_slicing*/) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ranks[ITEM]; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (linear_tid * ITEMS_PER_THREAD) + ITEM; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + + /** + * Exchanges data items annotated by rank into blocked arrangement. Specialized for warp-timeslicing. + */ + template + __device__ __forceinline__ void ScatterToBlocked( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. + OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks + Int2Type /*time_slicing*/) + { + InputT temp_items[ITEMS_PER_THREAD]; + + #pragma unroll + for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) + { + CTA_SYNC(); + + const int SLICE_OFFSET = TIME_SLICED_ITEMS * SLICE; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ranks[ITEM] - SLICE_OFFSET; + if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS)) + { + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + temp_storage.buff[item_offset] = input_items[ITEM]; + } + } + + CTA_SYNC(); + + if (warp_id == SLICE) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (lane_id * ITEMS_PER_THREAD) + ITEM; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + temp_items[ITEM] = temp_storage.buff[item_offset]; + } + } + } + + // Copy + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + output_items[ITEM] = temp_items[ITEM]; + } + } + + + /** + * Exchanges data items annotated by rank into striped arrangement. Specialized for no timeslicing. 
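+ * (Each input_items[ITEM] is written to shared memory at offset ranks[ITEM], and the
+ * striped result is read back at offset ITEM * BLOCK_THREADS + linear_tid, so for a
+ * fully defined output the ranks are expected to cover 0 .. TILE_ITEMS-1.)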
+ */ + template + __device__ __forceinline__ void ScatterToStriped( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. + OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks + Int2Type /*time_slicing*/) + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ranks[ITEM]; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + + + /** + * Exchanges data items annotated by rank into striped arrangement. Specialized for warp-timeslicing. + */ + template + __device__ __forceinline__ void ScatterToStriped( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between blocked and striped arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items to exchange, converting between blocked and striped arrangements. + OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks + Int2Type /*time_slicing*/) + { + InputT temp_items[ITEMS_PER_THREAD]; + + #pragma unroll + for (int SLICE = 0; SLICE < TIME_SLICES; SLICE++) + { + const int SLICE_OFFSET = SLICE * TIME_SLICED_ITEMS; + const int SLICE_OOB = SLICE_OFFSET + TIME_SLICED_ITEMS; + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ranks[ITEM] - SLICE_OFFSET; + if ((item_offset >= 0) && (item_offset < WARP_TIME_SLICED_ITEMS)) + { + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + temp_storage.buff[item_offset] = input_items[ITEM]; + } + } + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + // Read a strip of items + const int STRIP_OFFSET = ITEM * BLOCK_THREADS; + const int STRIP_OOB = STRIP_OFFSET + BLOCK_THREADS; + + if ((SLICE_OFFSET < STRIP_OOB) && (SLICE_OOB > STRIP_OFFSET)) + { + int item_offset = STRIP_OFFSET + linear_tid - SLICE_OFFSET; + if ((item_offset >= 0) && (item_offset < TIME_SLICED_ITEMS)) + { + if (INSERT_PADDING) item_offset += item_offset >> LOG_SMEM_BANKS; + temp_items[ITEM] = temp_storage.buff[item_offset]; + } + } + } + } + + // Copy + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + output_items[ITEM] = temp_items[ITEM]; + } + } + + +public: + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockExchange() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), + warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS), + lane_id(LaneId()), + warp_offset(warp_id * WARP_TIME_SLICED_ITEMS) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. 
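+ *
+ * \par
+ * A minimal construction sketch (the specialization and names are illustrative; the
+ * temporary storage may also be placed in a union so its shared memory can be
+ * repurposed between collective calls, separated by a barrier):
+ * \code
+ * __global__ void ExampleKernel(...)
+ * {
+ *     typedef cub::BlockExchange<int, 128, 4> BlockExchange;
+ *     __shared__ typename BlockExchange::TempStorage temp_storage;
+ *     BlockExchange block_exchange(temp_storage);
+ *     ...
+ * }
+ * \endcode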
+ */ + __device__ __forceinline__ BlockExchange( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), + lane_id(LaneId()), + warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS), + warp_offset(warp_id * WARP_TIME_SLICED_ITEMS) + {} + + + //@} end member group + /******************************************************************//** + * \name Structured exchanges + *********************************************************************/ + //@{ + + /** + * \brief Transposes data items from striped arrangement to blocked arrangement. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the conversion from a "striped" to a "blocked" arrangement + * of 512 integer items partitioned across 128 threads where each thread owns 4 items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, ...) + * { + * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockExchange BlockExchange; + * + * // Allocate shared memory for BlockExchange + * __shared__ typename BlockExchange::TempStorage temp_storage; + * + * // Load a tile of ordered data into a striped arrangement across block threads + * int thread_data[4]; + * cub::LoadDirectStriped<128>(threadIdx.x, d_data, thread_data); + * + * // Collectively exchange data into a blocked arrangement across threads + * BlockExchange(temp_storage).StripedToBlocked(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of striped input \p thread_data across the block of threads is + * { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] } after loading from device-accessible memory. + * The corresponding output \p thread_data in those threads will be + * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. + * + */ + template + __device__ __forceinline__ void StripedToBlocked( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. + OutputT output_items[ITEMS_PER_THREAD]) ///< [out] Items from exchange, converting between striped and blocked arrangements. + { + StripedToBlocked(input_items, output_items, Int2Type()); + } + + + /** + * \brief Transposes data items from blocked arrangement to striped arrangement. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the conversion from a "blocked" to a "striped" arrangement + * of 512 integer items partitioned across 128 threads where each thread owns 4 items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, ...) + * { + * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockExchange BlockExchange; + * + * // Allocate shared memory for BlockExchange + * __shared__ typename BlockExchange::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... 
+ * + * // Collectively exchange data into a striped arrangement across threads + * BlockExchange(temp_storage).BlockedToStriped(thread_data, thread_data); + * + * // Store data striped across block threads into an ordered tile + * cub::StoreDirectStriped(threadIdx.x, d_data, thread_data); + * + * \endcode + * \par + * Suppose the set of blocked input \p thread_data across the block of threads is + * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. + * The corresponding output \p thread_data in those threads will be + * { [0,128,256,384], [1,129,257,385], ..., [127,255,383,511] } in + * preparation for storing to device-accessible memory. + * + */ + template + __device__ __forceinline__ void BlockedToStriped( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. + OutputT output_items[ITEMS_PER_THREAD]) ///< [out] Items from exchange, converting between striped and blocked arrangements. + { + BlockedToStriped(input_items, output_items, Int2Type()); + } + + + + /** + * \brief Transposes data items from warp-striped arrangement to blocked arrangement. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the conversion from a "warp-striped" to a "blocked" arrangement + * of 512 integer items partitioned across 128 threads where each thread owns 4 items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, ...) + * { + * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockExchange BlockExchange; + * + * // Allocate shared memory for BlockExchange + * __shared__ typename BlockExchange::TempStorage temp_storage; + * + * // Load a tile of ordered data into a warp-striped arrangement across warp threads + * int thread_data[4]; + * cub::LoadSWarptriped(threadIdx.x, d_data, thread_data); + * + * // Collectively exchange data into a blocked arrangement across threads + * BlockExchange(temp_storage).WarpStripedToBlocked(thread_data); + * + * \endcode + * \par + * Suppose the set of warp-striped input \p thread_data across the block of threads is + * { [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] } + * after loading from device-accessible memory. (The first 128 items are striped across + * the first warp of 32 threads, the second 128 items are striped across the second warp, etc.) + * The corresponding output \p thread_data in those threads will be + * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. + * + */ + template + __device__ __forceinline__ void WarpStripedToBlocked( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. + OutputT output_items[ITEMS_PER_THREAD]) ///< [out] Items from exchange, converting between striped and blocked arrangements. + { + WarpStripedToBlocked(input_items, output_items, Int2Type()); + } + + + + /** + * \brief Transposes data items from blocked arrangement to warp-striped arrangement. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the conversion from a "blocked" to a "warp-striped" arrangement + * of 512 integer items partitioned across 128 threads where each thread owns 4 items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, ...) 
+ * { + * // Specialize BlockExchange for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockExchange BlockExchange; + * + * // Allocate shared memory for BlockExchange + * __shared__ typename BlockExchange::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively exchange data into a warp-striped arrangement across threads + * BlockExchange(temp_storage).BlockedToWarpStriped(thread_data, thread_data); + * + * // Store data striped across warp threads into an ordered tile + * cub::StoreDirectStriped(threadIdx.x, d_data, thread_data); + * + * \endcode + * \par + * Suppose the set of blocked input \p thread_data across the block of threads is + * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. + * The corresponding output \p thread_data in those threads will be + * { [0,32,64,96], [1,33,65,97], [2,34,66,98], ..., [415,447,479,511] } + * in preparation for storing to device-accessible memory. (The first 128 items are striped across + * the first warp of 32 threads, the second 128 items are striped across the second warp, etc.) + * + */ + template + __device__ __forceinline__ void BlockedToWarpStriped( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. + OutputT output_items[ITEMS_PER_THREAD]) ///< [out] Items from exchange, converting between striped and blocked arrangements. + { + BlockedToWarpStriped(input_items, output_items, Int2Type()); + } + + + + //@} end member group + /******************************************************************//** + * \name Scatter exchanges + *********************************************************************/ + //@{ + + + /** + * \brief Exchanges data items annotated by rank into blocked arrangement. + * + * \par + * - \smemreuse + * + * \tparam OffsetT [inferred] Signed integer type for local offsets + */ + template + __device__ __forceinline__ void ScatterToBlocked( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items from exchange, converting between striped and blocked arrangements. + OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + { + ScatterToBlocked(input_items, output_items, ranks, Int2Type()); + } + + + + /** + * \brief Exchanges data items annotated by rank into striped arrangement. + * + * \par + * - \smemreuse + * + * \tparam OffsetT [inferred] Signed integer type for local offsets + */ + template + __device__ __forceinline__ void ScatterToStriped( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items from exchange, converting between striped and blocked arrangements. + OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + { + ScatterToStriped(input_items, output_items, ranks, Int2Type()); + } + + + + /** + * \brief Exchanges data items annotated by rank into striped arrangement. Items with rank -1 are not exchanged. + * + * \par + * - \smemreuse + * + * \tparam OffsetT [inferred] Signed integer type for local offsets + */ + template + __device__ __forceinline__ void ScatterToStripedGuarded( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. 
+ OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items from exchange, converting between striped and blocked arrangements. + OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ranks[ITEM]; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + if (ranks[ITEM] >= 0) + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + + + + + /** + * \brief Exchanges valid data items annotated by rank into striped arrangement. + * + * \par + * - \smemreuse + * + * \tparam OffsetT [inferred] Signed integer type for local offsets + * \tparam ValidFlag [inferred] FlagT type denoting which items are valid + */ + template + __device__ __forceinline__ void ScatterToStripedFlagged( + InputT input_items[ITEMS_PER_THREAD], ///< [in] Items to exchange, converting between striped and blocked arrangements. + OutputT output_items[ITEMS_PER_THREAD], ///< [out] Items from exchange, converting between striped and blocked arrangements. + OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks + ValidFlag is_valid[ITEMS_PER_THREAD]) ///< [in] Corresponding flag denoting item validity + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = ranks[ITEM]; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + if (is_valid[ITEM]) + temp_storage.buff[item_offset] = input_items[ITEM]; + } + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = int(ITEM * BLOCK_THREADS) + linear_tid; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + output_items[ITEM] = temp_storage.buff[item_offset]; + } + } + + + //@} end member group + + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + + __device__ __forceinline__ void StripedToBlocked( + InputT items[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between striped and blocked arrangements. + { + StripedToBlocked(items, items); + } + + __device__ __forceinline__ void BlockedToStriped( + InputT items[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between striped and blocked arrangements. + { + BlockedToStriped(items, items); + } + + __device__ __forceinline__ void WarpStripedToBlocked( + InputT items[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between striped and blocked arrangements. + { + WarpStripedToBlocked(items, items); + } + + __device__ __forceinline__ void BlockedToWarpStriped( + InputT items[ITEMS_PER_THREAD]) ///< [in-out] Items to exchange, converting between striped and blocked arrangements. + { + BlockedToWarpStriped(items, items); + } + + template + __device__ __forceinline__ void ScatterToBlocked( + InputT items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between striped and blocked arrangements. 
+ OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + { + ScatterToBlocked(items, items, ranks); + } + + template + __device__ __forceinline__ void ScatterToStriped( + InputT items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between striped and blocked arrangements. + OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + { + ScatterToStriped(items, items, ranks); + } + + template + __device__ __forceinline__ void ScatterToStripedGuarded( + InputT items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between striped and blocked arrangements. + OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + { + ScatterToStripedGuarded(items, items, ranks); + } + + template + __device__ __forceinline__ void ScatterToStripedFlagged( + InputT items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange, converting between striped and blocked arrangements. + OffsetT ranks[ITEMS_PER_THREAD], ///< [in] Corresponding scatter ranks + ValidFlag is_valid[ITEMS_PER_THREAD]) ///< [in] Corresponding flag denoting item validity + { + ScatterToStriped(items, items, ranks, is_valid); + } + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +}; + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + +template < + typename T, + int ITEMS_PER_THREAD, + int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, + int PTX_ARCH = CUB_PTX_ARCH> +class WarpExchange +{ +private: + + /****************************************************************************** + * Constants + ******************************************************************************/ + + /// Constants + enum + { + // Whether the logical warp size and the PTX warp size coincide + IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), + + WARP_ITEMS = (ITEMS_PER_THREAD * LOGICAL_WARP_THREADS) + 1, + + LOG_SMEM_BANKS = CUB_LOG_SMEM_BANKS(PTX_ARCH), + SMEM_BANKS = 1 << LOG_SMEM_BANKS, + + // Insert padding if the number of items per thread is a power of two and > 4 (otherwise we can typically use 128b loads) + INSERT_PADDING = (ITEMS_PER_THREAD > 4) && (PowerOfTwo::VALUE), + PADDING_ITEMS = (INSERT_PADDING) ? (WARP_ITEMS >> LOG_SMEM_BANKS) : 0, + }; + + /****************************************************************************** + * Type definitions + ******************************************************************************/ + + /// Shared memory storage layout type + struct _TempStorage + { + T buff[WARP_ITEMS + PADDING_ITEMS]; + }; + +public: + + /// \smemstorage{WarpExchange} + struct TempStorage : Uninitialized<_TempStorage> {}; + +private: + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + _TempStorage &temp_storage; + int lane_id; + +public: + + /****************************************************************************** + * Construction + ******************************************************************************/ + + /// Constructor + __device__ __forceinline__ WarpExchange( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + lane_id(IS_ARCH_WARP ? + LaneId() : + LaneId() % LOGICAL_WARP_THREADS) + {} + + + /****************************************************************************** + * Interface + ******************************************************************************/ + + /** + * \brief Exchanges valid data items annotated by rank into striped arrangement. 
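+ *
+ * \par
+ * A usage sketch (WarpExchange is an internal warp-scope helper; the names below,
+ * including \p WARPS_PER_BLOCK and \p warp_id, are illustrative, with one TempStorage
+ * instance per logical warp):
+ * \code
+ * typedef cub::WarpExchange<int, 4> WarpExchange;
+ * __shared__ typename WarpExchange::TempStorage temp_storage[WARPS_PER_BLOCK];
+ * int items[4], ranks[4];
+ * ...
+ * WarpExchange(temp_storage[warp_id]).ScatterToStriped(items, ranks);
+ * \endcode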
+ * + * \par + * - \smemreuse + * + * \tparam OffsetT [inferred] Signed integer type for local offsets + */ + template + __device__ __forceinline__ void ScatterToStriped( + T items[ITEMS_PER_THREAD], ///< [in-out] Items to exchange + OffsetT ranks[ITEMS_PER_THREAD]) ///< [in] Corresponding scatter ranks + { + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if (INSERT_PADDING) ranks[ITEM] = SHR_ADD(ranks[ITEM], LOG_SMEM_BANKS, ranks[ITEM]); + temp_storage.buff[ranks[ITEM]] = items[ITEM]; + } + + WARP_SYNC(0xffffffff); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + int item_offset = (ITEM * LOGICAL_WARP_THREADS) + lane_id; + if (INSERT_PADDING) item_offset = SHR_ADD(item_offset, LOG_SMEM_BANKS, item_offset); + items[ITEM] = temp_storage.buff[item_offset]; + } + } + +}; + + + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/block/block_histogram.cuh b/GraphBLAS/CUDA/local_cub/block/block_histogram.cuh new file mode 100644 index 0000000000..b7cb9700e6 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/block/block_histogram.cuh @@ -0,0 +1,415 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockHistogram class provides [collective](index.html#sec0) methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. 
+ */ + +#pragma once + +#include "specializations/block_histogram_sort.cuh" +#include "specializations/block_histogram_atomic.cuh" +#include "../util_ptx.cuh" +#include "../util_arch.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Algorithmic variants + ******************************************************************************/ + +/** + * \brief BlockHistogramAlgorithm enumerates alternative algorithms for the parallel construction of block-wide histograms. + */ +enum BlockHistogramAlgorithm +{ + + /** + * \par Overview + * Sorting followed by differentiation. Execution is comprised of two phases: + * -# Sort the data using efficient radix sort + * -# Look for "runs" of same-valued keys by detecting discontinuities; the run-lengths are histogram bin counts. + * + * \par Performance Considerations + * Delivers consistent throughput regardless of sample bin distribution. + */ + BLOCK_HISTO_SORT, + + + /** + * \par Overview + * Use atomic addition to update byte counts directly + * + * \par Performance Considerations + * Performance is strongly tied to the hardware implementation of atomic + * addition, and may be significantly degraded for non uniformly-random + * input distributions where many concurrent updates are likely to be + * made to the same bin counter. + */ + BLOCK_HISTO_ATOMIC, +}; + + + +/****************************************************************************** + * Block histogram + ******************************************************************************/ + + +/** + * \brief The BlockHistogram class provides [collective](index.html#sec0) methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. ![](histogram_logo.png) + * \ingroup BlockModule + * + * \tparam T The sample type being histogrammed (must be castable to an integer bin identifier) + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam ITEMS_PER_THREAD The number of items per thread + * \tparam BINS The number bins within the histogram + * \tparam ALGORITHM [optional] cub::BlockHistogramAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_HISTO_SORT) + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - A histogram + * counts the number of observations that fall into each of the disjoint categories (known as bins). + * - BlockHistogram can be optionally specialized to use different algorithms: + * -# cub::BLOCK_HISTO_SORT. Sorting followed by differentiation. [More...](\ref cub::BlockHistogramAlgorithm) + * -# cub::BLOCK_HISTO_ATOMIC. Use atomic addition to update byte counts directly. [More...](\ref cub::BlockHistogramAlgorithm) + * + * \par Performance Considerations + * - \granularity + * + * \par A Simple Example + * \blockcollective{BlockHistogram} + * \par + * The code snippet below illustrates a 256-bin histogram of 512 integer samples that + * are partitioned across 128 threads where each thread owns 4 samples. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each + * typedef cub::BlockHistogram BlockHistogram; + * + * // Allocate shared memory for BlockHistogram + * __shared__ typename BlockHistogram::TempStorage temp_storage; + * + * // Allocate shared memory for block-wide histogram bin counts + * __shared__ unsigned int smem_histogram[256]; + * + * // Obtain input samples per thread + * unsigned char data[4]; + * ... + * + * // Compute the block-wide histogram + * BlockHistogram(temp_storage).Histogram(data, smem_histogram); + * + * \endcode + * + * \par Performance and Usage Considerations + * - The histogram output can be constructed in shared or device-accessible memory + * - See cub::BlockHistogramAlgorithm for performance details regarding algorithmic alternatives + * + */ +template < + typename T, + int BLOCK_DIM_X, + int ITEMS_PER_THREAD, + int BINS, + BlockHistogramAlgorithm ALGORITHM = BLOCK_HISTO_SORT, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockHistogram +{ +private: + + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + /** + * Ensure the template parameterization meets the requirements of the + * targeted device architecture. BLOCK_HISTO_ATOMIC can only be used + * on version SM120 or later. Otherwise BLOCK_HISTO_SORT is used + * regardless. + */ + static const BlockHistogramAlgorithm SAFE_ALGORITHM = + ((ALGORITHM == BLOCK_HISTO_ATOMIC) && (PTX_ARCH < 120)) ? + BLOCK_HISTO_SORT : + ALGORITHM; + + /// Internal specialization. + typedef typename If<(SAFE_ALGORITHM == BLOCK_HISTO_SORT), + BlockHistogramSort, + BlockHistogramAtomic >::Type InternalBlockHistogram; + + /// Shared memory storage layout type for BlockHistogram + typedef typename InternalBlockHistogram::TempStorage _TempStorage; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + unsigned int linear_tid; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + +public: + + /// \smemstorage{BlockHistogram} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockHistogram() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. 
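+ *
+ * \par
+ * A minimal construction sketch (the 256-bin, 128-thread, 4-sample specialization and
+ * the variable names are illustrative; when the histogram is the only collective in
+ * the kernel, the default constructor's private allocation works equally well):
+ * \code
+ * typedef cub::BlockHistogram<unsigned char, 128, 4, 256> BlockHistogram;
+ * __shared__ typename BlockHistogram::TempStorage temp_storage;
+ * BlockHistogram block_histogram(temp_storage);
+ * \endcode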
+ */ + __device__ __forceinline__ BlockHistogram( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Histogram operations + *********************************************************************/ + //@{ + + + /** + * \brief Initialize the shared histogram counters to zero. + * + * \par Snippet + * The code snippet below illustrates a the initialization and update of a + * histogram of 512 integer samples that are partitioned across 128 threads + * where each thread owns 4 samples. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each + * typedef cub::BlockHistogram BlockHistogram; + * + * // Allocate shared memory for BlockHistogram + * __shared__ typename BlockHistogram::TempStorage temp_storage; + * + * // Allocate shared memory for block-wide histogram bin counts + * __shared__ unsigned int smem_histogram[256]; + * + * // Obtain input samples per thread + * unsigned char thread_samples[4]; + * ... + * + * // Initialize the block-wide histogram + * BlockHistogram(temp_storage).InitHistogram(smem_histogram); + * + * // Update the block-wide histogram + * BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram); + * + * \endcode + * + * \tparam CounterT [inferred] Histogram counter type + */ + template + __device__ __forceinline__ void InitHistogram(CounterT histogram[BINS]) + { + // Initialize histogram bin counts to zeros + int histo_offset = 0; + + #pragma unroll + for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) + { + histogram[histo_offset + linear_tid] = 0; + } + // Finish up with guarded initialization if necessary + if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS)) + { + histogram[histo_offset + linear_tid] = 0; + } + } + + + /** + * \brief Constructs a block-wide histogram in shared/device-accessible memory. Each thread contributes an array of input elements. + * + * \par + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a 256-bin histogram of 512 integer samples that + * are partitioned across 128 threads where each thread owns 4 samples. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each + * typedef cub::BlockHistogram BlockHistogram; + * + * // Allocate shared memory for BlockHistogram + * __shared__ typename BlockHistogram::TempStorage temp_storage; + * + * // Allocate shared memory for block-wide histogram bin counts + * __shared__ unsigned int smem_histogram[256]; + * + * // Obtain input samples per thread + * unsigned char thread_samples[4]; + * ... 
+ * + * // Compute the block-wide histogram + * BlockHistogram(temp_storage).Histogram(thread_samples, smem_histogram); + * + * \endcode + * + * \tparam CounterT [inferred] Histogram counter type + */ + template < + typename CounterT > + __device__ __forceinline__ void Histogram( + T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram + CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram + { + // Initialize histogram bin counts to zeros + InitHistogram(histogram); + + CTA_SYNC(); + + // Composite the histogram + InternalBlockHistogram(temp_storage).Composite(items, histogram); + } + + + + /** + * \brief Updates an existing block-wide histogram in shared/device-accessible memory. Each thread composites an array of input elements. + * + * \par + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a the initialization and update of a + * histogram of 512 integer samples that are partitioned across 128 threads + * where each thread owns 4 samples. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize a 256-bin BlockHistogram type for a 1D block of 128 threads having 4 character samples each + * typedef cub::BlockHistogram BlockHistogram; + * + * // Allocate shared memory for BlockHistogram + * __shared__ typename BlockHistogram::TempStorage temp_storage; + * + * // Allocate shared memory for block-wide histogram bin counts + * __shared__ unsigned int smem_histogram[256]; + * + * // Obtain input samples per thread + * unsigned char thread_samples[4]; + * ... + * + * // Initialize the block-wide histogram + * BlockHistogram(temp_storage).InitHistogram(smem_histogram); + * + * // Update the block-wide histogram + * BlockHistogram(temp_storage).Composite(thread_samples, smem_histogram); + * + * \endcode + * + * \tparam CounterT [inferred] Histogram counter type + */ + template < + typename CounterT > + __device__ __forceinline__ void Composite( + T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram + CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram + { + InternalBlockHistogram(temp_storage).Composite(items, histogram); + } + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/block/block_load.cuh b/GraphBLAS/CUDA/local_cub/block/block_load.cuh new file mode 100644 index 0000000000..217f521234 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/block/block_load.cuh @@ -0,0 +1,1241 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2016, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Operations for reading linear tiles of data into the CUDA thread block. + */ + +#pragma once + +#include + +#include "block_exchange.cuh" +#include "../iterator/cache_modified_input_iterator.cuh" +#include "../util_ptx.cuh" +#include "../util_macro.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup UtilIo + * @{ + */ + + +/******************************************************************//** + * \name Blocked arrangement I/O (direct) + *********************************************************************/ +//@{ + + +/** + * \brief Load a linear segment of items into a blocked arrangement across the thread block. + * + * \blocked + * + * \tparam T [inferred] The data type to load. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. + */ +template < + typename InputT, + int ITEMS_PER_THREAD, + typename InputIteratorT> +__device__ __forceinline__ void LoadDirectBlocked( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load +{ + InputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD); + + // Load directly in thread-blocked order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + items[ITEM] = thread_itr[ITEM]; + } +} + + +/** + * \brief Load a linear segment of items into a blocked arrangement across the thread block, guarded by range. + * + * \blocked + * + * \tparam T [inferred] The data type to load. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. 
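+ *
+ * \par
+ * A guarded-load sketch (the kernel signature and the names \p d_in and \p num_valid
+ * are illustrative; out-of-range slots of \p thread_data are left unmodified):
+ * \code
+ * __global__ void ExampleKernel(int *d_in, int num_valid)
+ * {
+ *     int thread_data[4];
+ *     cub::LoadDirectBlocked(threadIdx.x, d_in, thread_data, num_valid);
+ * }
+ * \endcode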
+ */ +template < + typename InputT, + int ITEMS_PER_THREAD, + typename InputIteratorT> +__device__ __forceinline__ void LoadDirectBlocked( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load +{ + InputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD); + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if ((linear_tid * ITEMS_PER_THREAD) + ITEM < valid_items) + { + items[ITEM] = thread_itr[ITEM]; + } + } +} + + +/** + * \brief Load a linear segment of items into a blocked arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements.. + * + * \blocked + * + * \tparam T [inferred] The data type to load. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. + */ +template < + typename InputT, + typename DefaultT, + int ITEMS_PER_THREAD, + typename InputIteratorT> +__device__ __forceinline__ void LoadDirectBlocked( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + DefaultT oob_default) ///< [in] Default value to assign out-of-bound items +{ + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + items[ITEM] = oob_default; + + LoadDirectBlocked(linear_tid, block_itr, items, valid_items); +} + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +/** + * Internal implementation for load vectorization + */ +template < + CacheLoadModifier MODIFIER, + typename T, + int ITEMS_PER_THREAD> +__device__ __forceinline__ void InternalLoadDirectBlockedVectorized( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + T *block_ptr, ///< [in] Input pointer for loading from + T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load +{ + // Biggest memory access word that T is a whole multiple of + typedef typename UnitWord::DeviceWord DeviceWord; + + enum + { + TOTAL_WORDS = sizeof(items) / sizeof(DeviceWord), + + VECTOR_SIZE = (TOTAL_WORDS % 4 == 0) ? + 4 : + (TOTAL_WORDS % 2 == 0) ? + 2 : + 1, + + VECTORS_PER_THREAD = TOTAL_WORDS / VECTOR_SIZE, + }; + + // Vector type + typedef typename CubVector::Type Vector; + + // Vector items + Vector vec_items[VECTORS_PER_THREAD]; + + // Aliased input ptr + Vector* vec_ptr = reinterpret_cast(block_ptr) + (linear_tid * VECTORS_PER_THREAD); + + // Load directly in thread-blocked order + #pragma unroll + for (int ITEM = 0; ITEM < VECTORS_PER_THREAD; ITEM++) + { + vec_items[ITEM] = ThreadLoad(vec_ptr + ITEM); + } + + // Copy + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + items[ITEM] = *(reinterpret_cast(vec_items) + ITEM); + } +} + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/** + * \brief Load a linear segment of items into a blocked arrangement across the thread block. 
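+ * (This is the pointer-based, vectorized variant; see the alignment and fall-back
+ * conditions listed below.)
+ *
+ * \par
+ * A call sketch (\p d_in is an illustrative, suitably aligned pointer):
+ * \code
+ * int thread_data[4];
+ * cub::LoadDirectBlockedVectorized(threadIdx.x, d_in, thread_data);
+ * \endcode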
+ * + * \blocked + * + * The input offset (\p block_ptr + \p block_offset) must be quad-item aligned + * + * The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT: + * - \p ITEMS_PER_THREAD is odd + * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.) + * + * \tparam T [inferred] The data type to load. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + */ +template < + typename T, + int ITEMS_PER_THREAD> +__device__ __forceinline__ void LoadDirectBlockedVectorized( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + T *block_ptr, ///< [in] Input pointer for loading from + T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load +{ + InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items); +} + + +//@} end member group +/******************************************************************//** + * \name Striped arrangement I/O (direct) + *********************************************************************/ +//@{ + + +/** + * \brief Load a linear segment of items into a striped arrangement across the thread block. + * + * \striped + * + * \tparam BLOCK_THREADS The thread block size in threads + * \tparam T [inferred] The data type to load. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. + */ +template < + int BLOCK_THREADS, + typename InputT, + int ITEMS_PER_THREAD, + typename InputIteratorT> +__device__ __forceinline__ void LoadDirectStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load +{ + InputIteratorT thread_itr = block_itr + linear_tid; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + items[ITEM] = thread_itr[ITEM * BLOCK_THREADS]; + } +} + + +/** + * \brief Load a linear segment of items into a striped arrangement across the thread block, guarded by range + * + * \striped + * + * \tparam BLOCK_THREADS The thread block size in threads + * \tparam T [inferred] The data type to load. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. 
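+ *
+ * \par Snippet (illustrative)
+ * A minimal sketch of a guarded striped load, assuming a 1D block of 128
+ * threads and 4 items per thread; the kernel name and parameters are
+ * hypothetical, not part of CUB. Note that \p BLOCK_THREADS must be
+ * supplied explicitly as a template argument.
+ * \code
+ * #include <cub/block/block_load.cuh>
+ *
+ * __global__ void GuardedStripedLoadKernel(const float *d_in, int num_valid)
+ * {
+ *     float thread_data[4];
+ *     // Thread t reads items t, t+128, t+256, t+384, skipping any beyond num_valid
+ *     cub::LoadDirectStriped<128>(threadIdx.x, d_in, thread_data, num_valid);
+ * }
+ * \endcode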
+ */ +template < + int BLOCK_THREADS, + typename InputT, + int ITEMS_PER_THREAD, + typename InputIteratorT> +__device__ __forceinline__ void LoadDirectStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load +{ + InputIteratorT thread_itr = block_itr + linear_tid; + + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if (linear_tid + (ITEM * BLOCK_THREADS) < valid_items) + { + items[ITEM] = thread_itr[ITEM * BLOCK_THREADS]; + } + } +} + + +/** + * \brief Load a linear segment of items into a striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements. + * + * \striped + * + * \tparam BLOCK_THREADS The thread block size in threads + * \tparam T [inferred] The data type to load. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. + */ +template < + int BLOCK_THREADS, + typename InputT, + typename DefaultT, + int ITEMS_PER_THREAD, + typename InputIteratorT> +__device__ __forceinline__ void LoadDirectStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + DefaultT oob_default) ///< [in] Default value to assign out-of-bound items +{ + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + items[ITEM] = oob_default; + + LoadDirectStriped(linear_tid, block_itr, items, valid_items); +} + + + +//@} end member group +/******************************************************************//** + * \name Warp-striped arrangement I/O (direct) + *********************************************************************/ +//@{ + + +/** + * \brief Load a linear segment of items into a warp-striped arrangement across the thread block. + * + * \warpstriped + * + * \par Usage Considerations + * The number of threads in the thread block must be a multiple of the architecture's warp size. + * + * \tparam T [inferred] The data type to load. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. 
+ */ +template < + typename InputT, + int ITEMS_PER_THREAD, + typename InputIteratorT> +__device__ __forceinline__ void LoadDirectWarpStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load +{ + int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); + int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; + int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; + + InputIteratorT thread_itr = block_itr + warp_offset + tid ; + + // Load directly in warp-striped order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + items[ITEM] = thread_itr[(ITEM * CUB_PTX_WARP_THREADS)]; + } +} + + +/** + * \brief Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range + * + * \warpstriped + * + * \par Usage Considerations + * The number of threads in the thread block must be a multiple of the architecture's warp size. + * + * \tparam T [inferred] The data type to load. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. + */ +template < + typename InputT, + int ITEMS_PER_THREAD, + typename InputIteratorT> +__device__ __forceinline__ void LoadDirectWarpStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load +{ + int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); + int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; + int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; + + InputIteratorT thread_itr = block_itr + warp_offset + tid ; + + // Load directly in warp-striped order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items) + { + items[ITEM] = thread_itr[(ITEM * CUB_PTX_WARP_THREADS)]; + } + } +} + + +/** + * \brief Load a linear segment of items into a warp-striped arrangement across the thread block, guarded by range, with a fall-back assignment of out-of-bound elements. + * + * \warpstriped + * + * \par Usage Considerations + * The number of threads in the thread block must be a multiple of the architecture's warp size. + * + * \tparam T [inferred] The data type to load. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam InputIteratorT [inferred] The random-access iterator type for input \iterator. 
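+ *
+ * \par Snippet (illustrative)
+ * A minimal sketch of a guarded warp-striped load with an out-of-bounds
+ * default, assuming a 1D block of 128 threads (a multiple of the warp size)
+ * and 4 items per thread; the kernel name and parameters are hypothetical,
+ * not part of CUB.
+ * \code
+ * #include <cub/block/block_load.cuh>
+ *
+ * __global__ void GuardedWarpStripedLoadKernel(const int *d_in, int num_valid)
+ * {
+ *     int thread_data[4];
+ *     // Out-of-range slots are assigned -1 instead of being left unassigned
+ *     cub::LoadDirectWarpStriped(threadIdx.x, d_in, thread_data, num_valid, -1);
+ * }
+ * \endcode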
+ */ +template < + typename InputT, + typename DefaultT, + int ITEMS_PER_THREAD, + typename InputIteratorT> +__device__ __forceinline__ void LoadDirectWarpStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + DefaultT oob_default) ///< [in] Default value to assign out-of-bound items +{ + // Load directly in warp-striped order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + items[ITEM] = oob_default; + + LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items); +} + + + +//@} end member group + +/** @} */ // end group UtilIo + + + +//----------------------------------------------------------------------------- +// Generic BlockLoad abstraction +//----------------------------------------------------------------------------- + +/** + * \brief cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment of data from memory into a blocked arrangement across a CUDA thread block. + */ + +/** + * \brief cub::BlockLoadAlgorithm enumerates alternative algorithms for cub::BlockLoad to read a linear segment of data from memory into a blocked arrangement across a CUDA thread block. + */ +enum BlockLoadAlgorithm +{ + /** + * \par Overview + * + * A [blocked arrangement](index.html#sec5sec3) of data is read + * directly from memory. + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) decreases as the + * access stride between threads increases (i.e., the number items per thread). + */ + BLOCK_LOAD_DIRECT, + + /** + * \par Overview + * + * A [blocked arrangement](index.html#sec5sec3) of data is read + * from memory using CUDA's built-in vectorized loads as a coalescing optimization. + * For example, ld.global.v4.s32 instructions will be generated + * when \p T = \p int and \p ITEMS_PER_THREAD % 4 == 0. + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high until the the + * access stride between threads (i.e., the number items per thread) exceeds the + * maximum vector load width (typically 4 items or 64B, whichever is lower). + * - The following conditions will prevent vectorization and loading will fall back to cub::BLOCK_LOAD_DIRECT: + * - \p ITEMS_PER_THREAD is odd + * - The \p InputIteratorTis not a simple pointer type + * - The block input offset is not quadword-aligned + * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.) + */ + BLOCK_LOAD_VECTORIZE, + + /** + * \par Overview + * + * A [striped arrangement](index.html#sec5sec3) of data is read + * efficiently from memory and then locally transposed into a + * [blocked arrangement](index.html#sec5sec3). + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high regardless + * of items loaded per thread. + * - The local reordering incurs slightly longer latencies and throughput than the + * direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives. 
+ */ + BLOCK_LOAD_TRANSPOSE, + + + /** + * \par Overview + * + * A [warp-striped arrangement](index.html#sec5sec3) of data is + * read efficiently from memory and then locally transposed into a + * [blocked arrangement](index.html#sec5sec3). + * + * \par Usage Considerations + * - BLOCK_THREADS must be a multiple of WARP_THREADS + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high regardless + * of items loaded per thread. + * - The local reordering incurs slightly larger latencies than the + * direct cub::BLOCK_LOAD_DIRECT and cub::BLOCK_LOAD_VECTORIZE alternatives. + * - Provisions more shared storage, but incurs smaller latencies than the + * BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED alternative. + */ + BLOCK_LOAD_WARP_TRANSPOSE, + + + /** + * \par Overview + * + * Like \p BLOCK_LOAD_WARP_TRANSPOSE, a [warp-striped arrangement](index.html#sec5sec3) + * of data is read directly from memory and then is locally transposed into a + * [blocked arrangement](index.html#sec5sec3). To reduce the shared memory + * requirement, only one warp's worth of shared memory is provisioned and is + * subsequently time-sliced among warps. + * + * \par Usage Considerations + * - BLOCK_THREADS must be a multiple of WARP_THREADS + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high regardless + * of items loaded per thread. + * - Provisions less shared memory temporary storage, but incurs larger + * latencies than the BLOCK_LOAD_WARP_TRANSPOSE alternative. + */ + BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED, +}; + + +/** + * \brief The BlockLoad class provides [collective](index.html#sec0) data movement methods for loading a linear segment of items from memory into a [blocked arrangement](index.html#sec5sec3) across a CUDA thread block. ![](block_load_logo.png) + * \ingroup BlockModule + * \ingroup UtilIo + * + * \tparam InputT The data type to read into (which must be convertible from the input iterator's value type). + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam ITEMS_PER_THREAD The number of consecutive items partitioned onto each thread. + * \tparam ALGORITHM [optional] cub::BlockLoadAlgorithm tuning policy. default: cub::BLOCK_LOAD_DIRECT. + * \tparam WARP_TIME_SLICING [optional] Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage). (default: false) + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - The BlockLoad class provides a single data movement abstraction that can be specialized + * to implement different cub::BlockLoadAlgorithm strategies. This facilitates different + * performance policies for different architectures, data types, granularity sizes, etc. + * - BlockLoad can be optionally specialized by different data movement strategies: + * -# cub::BLOCK_LOAD_DIRECT. A [blocked arrangement](index.html#sec5sec3) + * of data is read directly from memory. [More...](\ref cub::BlockLoadAlgorithm) + * -# cub::BLOCK_LOAD_VECTORIZE. A [blocked arrangement](index.html#sec5sec3) + * of data is read directly from memory using CUDA's built-in vectorized loads as a + * coalescing optimization. 
[More...](\ref cub::BlockLoadAlgorithm) + * -# cub::BLOCK_LOAD_TRANSPOSE. A [striped arrangement](index.html#sec5sec3) + * of data is read directly from memory and is then locally transposed into a + * [blocked arrangement](index.html#sec5sec3). [More...](\ref cub::BlockLoadAlgorithm) + * -# cub::BLOCK_LOAD_WARP_TRANSPOSE. A [warp-striped arrangement](index.html#sec5sec3) + * of data is read directly from memory and is then locally transposed into a + * [blocked arrangement](index.html#sec5sec3). [More...](\ref cub::BlockLoadAlgorithm) + * -# cub::BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED,. A [warp-striped arrangement](index.html#sec5sec3) + * of data is read directly from memory and is then locally transposed into a + * [blocked arrangement](index.html#sec5sec3) one warp at a time. [More...](\ref cub::BlockLoadAlgorithm) + * - \rowmajor + * + * \par A Simple Example + * \blockcollective{BlockLoad} + * \par + * The code snippet below illustrates the loading of a linear + * segment of 512 integers into a "blocked" arrangement across 128 threads where each + * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE, + * meaning memory references are efficiently coalesced using a warp-striped access + * pattern (after which items are locally reordered among threads). + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, ...) + * { + * // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockLoad BlockLoad; + * + * // Allocate shared memory for BlockLoad + * __shared__ typename BlockLoad::TempStorage temp_storage; + * + * // Load a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * BlockLoad(temp_storage).Load(d_data, thread_data); + * + * \endcode + * \par + * Suppose the input \p d_data is 0, 1, 2, 3, 4, 5, .... + * The set of \p thread_data across the block of threads in those threads will be + * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. 
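+ *
+ * \par
+ * For clarity, a self-contained sketch of the same scenario with the
+ * template parameters spelled out explicitly (\p int items, 128 threads,
+ * 4 items per thread, \p BLOCK_LOAD_WARP_TRANSPOSE); the kernel name and
+ * launch shown are illustrative only.
+ * \code
+ * #include <cub/block/block_load.cuh>
+ *
+ * __global__ void BlockLoadExampleKernel(const int *d_data)
+ * {
+ *     typedef cub::BlockLoad<int, 128, 4, cub::BLOCK_LOAD_WARP_TRANSPOSE> BlockLoadT;
+ *     __shared__ typename BlockLoadT::TempStorage temp_storage;
+ *
+ *     // Each thread ends up owning 4 consecutive items of the 512-item tile
+ *     int thread_data[4];
+ *     BlockLoadT(temp_storage).Load(d_data, thread_data);
+ * }
+ *
+ * // Host side: one block of 128 threads loads one 512-item tile
+ * // BlockLoadExampleKernel<<<1, 128>>>(d_data);
+ * \endcode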
+ * + */ +template < + typename InputT, + int BLOCK_DIM_X, + int ITEMS_PER_THREAD, + BlockLoadAlgorithm ALGORITHM = BLOCK_LOAD_DIRECT, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockLoad +{ +private: + + /****************************************************************************** + * Constants and typed definitions + ******************************************************************************/ + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + + /****************************************************************************** + * Algorithmic variants + ******************************************************************************/ + + /// Load helper + template + struct LoadInternal; + + + /** + * BLOCK_LOAD_DIRECT specialization of load helper + */ + template + struct LoadInternal + { + /// Shared memory storage layout type + typedef NullType TempStorage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ LoadInternal( + TempStorage &/*temp_storage*/, + int linear_tid) + : + linear_tid(linear_tid) + {} + + /// Load a linear segment of items from memory + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + { + LoadDirectBlocked(linear_tid, block_itr, items); + } + + /// Load a linear segment of items from memory, guarded by range + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load + { + LoadDirectBlocked(linear_tid, block_itr, items, valid_items); + } + + /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + DefaultT oob_default) ///< [in] Default value to assign out-of-bound items + { + LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default); + } + + }; + + + /** + * BLOCK_LOAD_VECTORIZE specialization of load helper + */ + template + struct LoadInternal + { + /// Shared memory storage layout type + typedef NullType TempStorage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ LoadInternal( + TempStorage &/*temp_storage*/, + int linear_tid) + : + linear_tid(linear_tid) + {} + + /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization) + template + __device__ __forceinline__ void Load( + InputT *block_ptr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + { + InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items); + } + + /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization) + template + __device__ __forceinline__ void Load( + const InputT *block_ptr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + { + 
InternalLoadDirectBlockedVectorized(linear_tid, block_ptr, items); + } + + /// Load a linear segment of items from memory, specialized for native pointer types (attempts vectorization) + template < + CacheLoadModifier MODIFIER, + typename ValueType, + typename OffsetT> + __device__ __forceinline__ void Load( + CacheModifiedInputIterator block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + { + InternalLoadDirectBlockedVectorized(linear_tid, block_itr.ptr, items); + } + + /// Load a linear segment of items from memory, specialized for opaque input iterators (skips vectorization) + template + __device__ __forceinline__ void Load( + _InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + { + LoadDirectBlocked(linear_tid, block_itr, items); + } + + /// Load a linear segment of items from memory, guarded by range (skips vectorization) + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load + { + LoadDirectBlocked(linear_tid, block_itr, items, valid_items); + } + + /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements (skips vectorization) + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + DefaultT oob_default) ///< [in] Default value to assign out-of-bound items + { + LoadDirectBlocked(linear_tid, block_itr, items, valid_items, oob_default); + } + + }; + + + /** + * BLOCK_LOAD_TRANSPOSE specialization of load helper + */ + template + struct LoadInternal + { + // BlockExchange utility type for keys + typedef BlockExchange BlockExchange; + + /// Shared memory storage layout type + struct _TempStorage : BlockExchange::TempStorage + {}; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ LoadInternal( + TempStorage &temp_storage, + int linear_tid) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + /// Load a linear segment of items from memory + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load{ + { + LoadDirectStriped(linear_tid, block_itr, items); + BlockExchange(temp_storage).StripedToBlocked(items, items); + } + + /// Load a linear segment of items from memory, guarded by range + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load + { + LoadDirectStriped(linear_tid, block_itr, items, valid_items); + BlockExchange(temp_storage).StripedToBlocked(items, items); + } + + /// Load a linear segment of items from memory, guarded by range, 
with a fall-back assignment of out-of-bound elements + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + DefaultT oob_default) ///< [in] Default value to assign out-of-bound items + { + LoadDirectStriped(linear_tid, block_itr, items, valid_items, oob_default); + BlockExchange(temp_storage).StripedToBlocked(items, items); + } + + }; + + + /** + * BLOCK_LOAD_WARP_TRANSPOSE specialization of load helper + */ + template + struct LoadInternal + { + enum + { + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH) + }; + + // Assert BLOCK_THREADS must be a multiple of WARP_THREADS + CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); + + // BlockExchange utility type for keys + typedef BlockExchange BlockExchange; + + /// Shared memory storage layout type + struct _TempStorage : BlockExchange::TempStorage + {}; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ LoadInternal( + TempStorage &temp_storage, + int linear_tid) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + /// Load a linear segment of items from memory + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load{ + { + LoadDirectWarpStriped(linear_tid, block_itr, items); + BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + } + + /// Load a linear segment of items from memory, guarded by range + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load + { + LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items); + BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + } + + + /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + DefaultT oob_default) ///< [in] Default value to assign out-of-bound items + { + LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items, oob_default); + BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + } + }; + + + /** + * BLOCK_LOAD_WARP_TRANSPOSE_TIMESLICED specialization of load helper + */ + template + struct LoadInternal + { + enum + { + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH) + }; + + // Assert BLOCK_THREADS must be a multiple of WARP_THREADS + CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); + + // BlockExchange utility type for keys + typedef BlockExchange BlockExchange; + + /// Shared memory storage layout type + struct _TempStorage : BlockExchange::TempStorage + {}; + + /// Alias wrapper allowing storage to be 
unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ LoadInternal( + TempStorage &temp_storage, + int linear_tid) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + /// Load a linear segment of items from memory + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load{ + { + LoadDirectWarpStriped(linear_tid, block_itr, items); + BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + } + + /// Load a linear segment of items from memory, guarded by range + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load + { + LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items); + BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + } + + + /// Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + DefaultT oob_default) ///< [in] Default value to assign out-of-bound items + { + LoadDirectWarpStriped(linear_tid, block_itr, items, valid_items, oob_default); + BlockExchange(temp_storage).WarpStripedToBlocked(items, items); + } + }; + + + /****************************************************************************** + * Type definitions + ******************************************************************************/ + + /// Internal load implementation to use + typedef LoadInternal InternalLoad; + + + /// Shared memory storage layout type + typedef typename InternalLoad::TempStorage _TempStorage; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + +public: + + /// \smemstorage{BlockLoad} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockLoad() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. 
+ */ + __device__ __forceinline__ BlockLoad( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + + + //@} end member group + /******************************************************************//** + * \name Data movement + *********************************************************************/ + //@{ + + + /** + * \brief Load a linear segment of items from memory. + * + * \par + * - \blocked + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the loading of a linear + * segment of 512 integers into a "blocked" arrangement across 128 threads where each + * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE, + * meaning memory references are efficiently coalesced using a warp-striped access + * pattern (after which items are locally reordered among threads). + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, ...) + * { + * // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockLoad BlockLoad; + * + * // Allocate shared memory for BlockLoad + * __shared__ typename BlockLoad::TempStorage temp_storage; + * + * // Load a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * BlockLoad(temp_storage).Load(d_data, thread_data); + * + * \endcode + * \par + * Suppose the input \p d_data is 0, 1, 2, 3, 4, 5, .... + * The set of \p thread_data across the block of threads in those threads will be + * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. + * + */ + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load + { + InternalLoad(temp_storage, linear_tid).Load(block_itr, items); + } + + + /** + * \brief Load a linear segment of items from memory, guarded by range. + * + * \par + * - \blocked + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the guarded loading of a linear + * segment of 512 integers into a "blocked" arrangement across 128 threads where each + * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE, + * meaning memory references are efficiently coalesced using a warp-striped access + * pattern (after which items are locally reordered among threads). + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, int valid_items, ...) + * { + * // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockLoad BlockLoad; + * + * // Allocate shared memory for BlockLoad + * __shared__ typename BlockLoad::TempStorage temp_storage; + * + * // Load a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * BlockLoad(temp_storage).Load(d_data, thread_data, valid_items); + * + * \endcode + * \par + * Suppose the input \p d_data is 0, 1, 2, 3, 4, 5, 6... and \p valid_items is \p 5. + * The set of \p thread_data across the block of threads in those threads will be + * { [0,1,2,3], [4,?,?,?], ..., [?,?,?,?] }, with only the first two threads + * being unmasked to load portions of valid data (and other items remaining unassigned). 
+ * + */ + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items) ///< [in] Number of valid items to load + { + InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items); + } + + + /** + * \brief Load a linear segment of items from memory, guarded by range, with a fall-back assignment of out-of-bound elements + * + * \par + * - \blocked + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the guarded loading of a linear + * segment of 512 integers into a "blocked" arrangement across 128 threads where each + * thread owns 4 consecutive items. The load is specialized for \p BLOCK_LOAD_WARP_TRANSPOSE, + * meaning memory references are efficiently coalesced using a warp-striped access + * pattern (after which items are locally reordered among threads). + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, int valid_items, ...) + * { + * // Specialize BlockLoad for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockLoad BlockLoad; + * + * // Allocate shared memory for BlockLoad + * __shared__ typename BlockLoad::TempStorage temp_storage; + * + * // Load a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * BlockLoad(temp_storage).Load(d_data, thread_data, valid_items, -1); + * + * \endcode + * \par + * Suppose the input \p d_data is 0, 1, 2, 3, 4, 5, 6..., + * \p valid_items is \p 5, and the out-of-bounds default is \p -1. + * The set of \p thread_data across the block of threads in those threads will be + * { [0,1,2,3], [4,-1,-1,-1], ..., [-1,-1,-1,-1] }, with only the first two threads + * being unmasked to load portions of valid data (and other items are assigned \p -1) + * + */ + template + __device__ __forceinline__ void Load( + InputIteratorT block_itr, ///< [in] The thread block's base input iterator for loading from + InputT (&items)[ITEMS_PER_THREAD], ///< [out] Data to load + int valid_items, ///< [in] Number of valid items to load + DefaultT oob_default) ///< [in] Default value to assign out-of-bound items + { + InternalLoad(temp_storage, linear_tid).Load(block_itr, items, valid_items, oob_default); + } + + + //@} end member group + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/block/block_radix_rank.cuh b/GraphBLAS/CUDA/local_cub/block/block_radix_rank.cuh new file mode 100644 index 0000000000..c26451c666 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/block/block_radix_rank.cuh @@ -0,0 +1,696 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block + */ + +#pragma once + +#include + +#include "../thread/thread_reduce.cuh" +#include "../thread/thread_scan.cuh" +#include "../block/block_scan.cuh" +#include "../util_ptx.cuh" +#include "../util_arch.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block. + * \ingroup BlockModule + * + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam RADIX_BITS The number of radix bits per digit place + * \tparam IS_DESCENDING Whether or not the sorted-order is high-to-low + * \tparam MEMOIZE_OUTER_SCAN [optional] Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: true for architectures SM35 and newer, false otherwise). See BlockScanAlgorithm::BLOCK_SCAN_RAKING_MEMOIZE for more details. + * \tparam INNER_SCAN_ALGORITHM [optional] The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS) + * \tparam SMEM_CONFIG [optional] Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte) + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * Blah... + * - Keys must be in a form suitable for radix ranking (i.e., unsigned bits). + * - \blocked + * + * \par Performance Considerations + * - \granularity + * + * \par Examples + * \par + * - Example 1: Simple radix rank of 32-bit integer keys + * \code + * #include + * + * template + * __global__ void ExampleKernel(...) + * { + * + * \endcode + */ +template < + int BLOCK_DIM_X, + int RADIX_BITS, + bool IS_DESCENDING, + bool MEMOIZE_OUTER_SCAN = (CUB_PTX_ARCH >= 350) ? 
true : false, + BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, + cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockRadixRank +{ +private: + + /****************************************************************************** + * Type definitions and constants + ******************************************************************************/ + + // Integer type for digit counters (to be packed into words of type PackedCounters) + typedef unsigned short DigitCounter; + + // Integer type for packing DigitCounters into columns of shared memory banks + typedef typename If<(SMEM_CONFIG == cudaSharedMemBankSizeEightByte), + unsigned long long, + unsigned int>::Type PackedCounter; + + enum + { + // The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + RADIX_DIGITS = 1 << RADIX_BITS, + + LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH), + WARP_THREADS = 1 << LOG_WARP_THREADS, + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + + BYTES_PER_COUNTER = sizeof(DigitCounter), + LOG_BYTES_PER_COUNTER = Log2::VALUE, + + PACKING_RATIO = sizeof(PackedCounter) / sizeof(DigitCounter), + LOG_PACKING_RATIO = Log2::VALUE, + + LOG_COUNTER_LANES = CUB_MAX((RADIX_BITS - LOG_PACKING_RATIO), 0), // Always at least one lane + COUNTER_LANES = 1 << LOG_COUNTER_LANES, + + // The number of packed counters per thread (plus one for padding) + PADDED_COUNTER_LANES = COUNTER_LANES + 1, + RAKING_SEGMENT = PADDED_COUNTER_LANES, + }; + +public: + + enum + { + /// Number of bin-starting offsets tracked per thread + BINS_TRACKED_PER_THREAD = CUB_MAX(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS), + }; + +private: + + + /// BlockScan type + typedef BlockScan< + PackedCounter, + BLOCK_DIM_X, + INNER_SCAN_ALGORITHM, + BLOCK_DIM_Y, + BLOCK_DIM_Z, + PTX_ARCH> + BlockScan; + + + /// Shared memory storage layout type for BlockRadixRank + struct __align__(16) _TempStorage + { + union Aliasable + { + DigitCounter digit_counters[PADDED_COUNTER_LANES][BLOCK_THREADS][PACKING_RATIO]; + PackedCounter raking_grid[BLOCK_THREADS][RAKING_SEGMENT]; + + } aliasable; + + // Storage for scanning local ranks + typename BlockScan::TempStorage block_scan; + }; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + unsigned int linear_tid; + + /// Copy of raking segment, promoted to registers + PackedCounter cached_segment[RAKING_SEGMENT]; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /** + * Internal storage allocator + */ + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /** + * Performs upsweep raking reduction, returning the aggregate + */ + __device__ __forceinline__ PackedCounter Upsweep() + { + PackedCounter *smem_raking_ptr = temp_storage.aliasable.raking_grid[linear_tid]; + PackedCounter *raking_ptr; + + if (MEMOIZE_OUTER_SCAN) + { + // Copy data into registers + #pragma unroll + for (int i = 0; i < RAKING_SEGMENT; i++) + { + cached_segment[i] = smem_raking_ptr[i]; + } + raking_ptr = cached_segment; + } + else + { + 
raking_ptr = smem_raking_ptr; + } + + return internal::ThreadReduce(raking_ptr, Sum()); + } + + + /// Performs exclusive downsweep raking scan + __device__ __forceinline__ void ExclusiveDownsweep( + PackedCounter raking_partial) + { + PackedCounter *smem_raking_ptr = temp_storage.aliasable.raking_grid[linear_tid]; + + PackedCounter *raking_ptr = (MEMOIZE_OUTER_SCAN) ? + cached_segment : + smem_raking_ptr; + + // Exclusive raking downsweep scan + internal::ThreadScanExclusive(raking_ptr, raking_ptr, Sum(), raking_partial); + + if (MEMOIZE_OUTER_SCAN) + { + // Copy data back to smem + #pragma unroll + for (int i = 0; i < RAKING_SEGMENT; i++) + { + smem_raking_ptr[i] = cached_segment[i]; + } + } + } + + + /** + * Reset shared memory digit counters + */ + __device__ __forceinline__ void ResetCounters() + { + // Reset shared memory digit counters + #pragma unroll + for (int LANE = 0; LANE < PADDED_COUNTER_LANES; LANE++) + { + *((PackedCounter*) temp_storage.aliasable.digit_counters[LANE][linear_tid]) = 0; + } + } + + + /** + * Block-scan prefix callback + */ + struct PrefixCallBack + { + __device__ __forceinline__ PackedCounter operator()(PackedCounter block_aggregate) + { + PackedCounter block_prefix = 0; + + // Propagate totals in packed fields + #pragma unroll + for (int PACKED = 1; PACKED < PACKING_RATIO; PACKED++) + { + block_prefix += block_aggregate << (sizeof(DigitCounter) * 8 * PACKED); + } + + return block_prefix; + } + }; + + + /** + * Scan shared memory digit counters. + */ + __device__ __forceinline__ void ScanCounters() + { + // Upsweep scan + PackedCounter raking_partial = Upsweep(); + + // Compute exclusive sum + PackedCounter exclusive_partial; + PrefixCallBack prefix_call_back; + BlockScan(temp_storage.block_scan).ExclusiveSum(raking_partial, exclusive_partial, prefix_call_back); + + // Downsweep scan with exclusive partial + ExclusiveDownsweep(exclusive_partial); + } + +public: + + /// \smemstorage{BlockScan} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockRadixRank() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. + */ + __device__ __forceinline__ BlockRadixRank( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Raking + *********************************************************************/ + //@{ + + /** + * \brief Rank keys. 
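+ *
+ * \par Snippet (illustrative)
+ * A minimal sketch of ranking one 4-bit digit place for a tile of unsigned
+ * keys, assuming 128 threads and 4 keys per thread; the kernel name and the
+ * \p d_keys parameter are hypothetical, not part of CUB.
+ * \code
+ * #include <cub/block/block_radix_rank.cuh>
+ *
+ * __global__ void RankDigitsKernel(const unsigned int *d_keys)
+ * {
+ *     typedef cub::BlockRadixRank<128, 4, false> BlockRadixRankT;
+ *     __shared__ typename BlockRadixRankT::TempStorage temp_storage;
+ *
+ *     // Blocked arrangement: thread t owns keys 4t .. 4t+3
+ *     unsigned int keys[4];
+ *     for (int i = 0; i < 4; ++i)
+ *         keys[i] = d_keys[(threadIdx.x * 4) + i];
+ *
+ *     // Rank each key by its least-significant 4-bit digit
+ *     int ranks[4];
+ *     BlockRadixRankT(temp_storage).RankKeys(keys, ranks, 0, 4);
+ * }
+ * \endcode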
+ */ + template < + typename UnsignedBits, + int KEYS_PER_THREAD> + __device__ __forceinline__ void RankKeys( + UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile + int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile + int current_bit, ///< [in] The least-significant bit position of the current digit to extract + int num_bits) ///< [in] The number of bits in the current digit + { + DigitCounter thread_prefixes[KEYS_PER_THREAD]; // For each key, the count of previous keys in this tile having the same digit + DigitCounter* digit_counters[KEYS_PER_THREAD]; // For each key, the byte-offset of its corresponding digit counter in smem + + // Reset shared memory digit counters + ResetCounters(); + + #pragma unroll + for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM) + { + // Get digit + unsigned int digit = BFE(keys[ITEM], current_bit, num_bits); + + // Get sub-counter + unsigned int sub_counter = digit >> LOG_COUNTER_LANES; + + // Get counter lane + unsigned int counter_lane = digit & (COUNTER_LANES - 1); + + if (IS_DESCENDING) + { + sub_counter = PACKING_RATIO - 1 - sub_counter; + counter_lane = COUNTER_LANES - 1 - counter_lane; + } + + // Pointer to smem digit counter + digit_counters[ITEM] = &temp_storage.aliasable.digit_counters[counter_lane][linear_tid][sub_counter]; + + // Load thread-exclusive prefix + thread_prefixes[ITEM] = *digit_counters[ITEM]; + + // Store inclusive prefix + *digit_counters[ITEM] = thread_prefixes[ITEM] + 1; + } + + CTA_SYNC(); + + // Scan shared memory counters + ScanCounters(); + + CTA_SYNC(); + + // Extract the local ranks of each key + for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM) + { + // Add in thread block exclusive prefix + ranks[ITEM] = thread_prefixes[ITEM] + *digit_counters[ITEM]; + } + } + + + /** + * \brief Rank keys. For the lower \p RADIX_DIGITS threads, digit counts for each digit are provided for the corresponding thread. + */ + template < + typename UnsignedBits, + int KEYS_PER_THREAD> + __device__ __forceinline__ void RankKeys( + UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile + int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile (out parameter) + int current_bit, ///< [in] The least-significant bit position of the current digit to extract + int num_bits, ///< [in] The number of bits in the current digit + int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD]) ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... (threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] + { + // Rank keys + RankKeys(keys, ranks, current_bit, num_bits); + + // Get the inclusive and exclusive digit totals corresponding to the calling thread. + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (linear_tid * BINS_TRACKED_PER_THREAD) + track; + + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + { + if (IS_DESCENDING) + bin_idx = RADIX_DIGITS - bin_idx - 1; + + // Obtain ex/inclusive digit counts. (Unfortunately these all reside in the + // first counter column, resulting in unavoidable bank conflicts.) 
+ unsigned int counter_lane = (bin_idx & (COUNTER_LANES - 1)); + unsigned int sub_counter = bin_idx >> (LOG_COUNTER_LANES); + + exclusive_digit_prefix[track] = temp_storage.aliasable.digit_counters[counter_lane][0][sub_counter]; + } + } + } +}; + + + + + +/** + * Radix-rank using match.any + */ +template < + int BLOCK_DIM_X, + int RADIX_BITS, + bool IS_DESCENDING, + BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockRadixRankMatch +{ +private: + + /****************************************************************************** + * Type definitions and constants + ******************************************************************************/ + + typedef int32_t RankT; + typedef int32_t DigitCounterT; + + enum + { + // The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + RADIX_DIGITS = 1 << RADIX_BITS, + + LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH), + WARP_THREADS = 1 << LOG_WARP_THREADS, + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + + PADDED_WARPS = ((WARPS & 0x1) == 0) ? + WARPS + 1 : + WARPS, + + COUNTERS = PADDED_WARPS * RADIX_DIGITS, + RAKING_SEGMENT = (COUNTERS + BLOCK_THREADS - 1) / BLOCK_THREADS, + PADDED_RAKING_SEGMENT = ((RAKING_SEGMENT & 0x1) == 0) ? + RAKING_SEGMENT + 1 : + RAKING_SEGMENT, + }; + +public: + + enum + { + /// Number of bin-starting offsets tracked per thread + BINS_TRACKED_PER_THREAD = CUB_MAX(1, (RADIX_DIGITS + BLOCK_THREADS - 1) / BLOCK_THREADS), + }; + +private: + + /// BlockScan type + typedef BlockScan< + DigitCounterT, + BLOCK_THREADS, + INNER_SCAN_ALGORITHM, + BLOCK_DIM_Y, + BLOCK_DIM_Z, + PTX_ARCH> + BlockScanT; + + + /// Shared memory storage layout type for BlockRadixRank + struct __align__(16) _TempStorage + { + typename BlockScanT::TempStorage block_scan; + + union __align__(16) Aliasable + { + volatile DigitCounterT warp_digit_counters[RADIX_DIGITS][PADDED_WARPS]; + DigitCounterT raking_grid[BLOCK_THREADS][PADDED_RAKING_SEGMENT]; + + } aliasable; + }; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + unsigned int linear_tid; + + + +public: + + /// \smemstorage{BlockScan} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. + */ + __device__ __forceinline__ BlockRadixRankMatch( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Raking + *********************************************************************/ + //@{ + + /** + * \brief Rank keys. 
+ */ + template < + typename UnsignedBits, + int KEYS_PER_THREAD> + __device__ __forceinline__ void RankKeys( + UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile + int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile + int current_bit, ///< [in] The least-significant bit position of the current digit to extract + int num_bits) ///< [in] The number of bits in the current digit + { + // Initialize shared digit counters + + #pragma unroll + for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM) + temp_storage.aliasable.raking_grid[linear_tid][ITEM] = 0; + + CTA_SYNC(); + + // Each warp will strip-mine its section of input, one strip at a time + + volatile DigitCounterT *digit_counters[KEYS_PER_THREAD]; + uint32_t warp_id = linear_tid >> LOG_WARP_THREADS; + uint32_t lane_mask_lt = LaneMaskLt(); + + #pragma unroll + for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM) + { + // My digit + uint32_t digit = BFE(keys[ITEM], current_bit, num_bits); + + if (IS_DESCENDING) + digit = RADIX_DIGITS - digit - 1; + + // Mask of peers who have same digit as me + uint32_t peer_mask = MatchAny(digit); + + // Pointer to smem digit counter for this key + digit_counters[ITEM] = &temp_storage.aliasable.warp_digit_counters[digit][warp_id]; + + // Number of occurrences in previous strips + DigitCounterT warp_digit_prefix = *digit_counters[ITEM]; + + // Warp-sync + WARP_SYNC(0xFFFFFFFF); + + // Number of peers having same digit as me + int32_t digit_count = __popc(peer_mask); + + // Number of lower-ranked peers having same digit seen so far + int32_t peer_digit_prefix = __popc(peer_mask & lane_mask_lt); + + if (peer_digit_prefix == 0) + { + // First thread for each digit updates the shared warp counter + *digit_counters[ITEM] = DigitCounterT(warp_digit_prefix + digit_count); + } + + // Warp-sync + WARP_SYNC(0xFFFFFFFF); + + // Number of prior keys having same digit + ranks[ITEM] = warp_digit_prefix + DigitCounterT(peer_digit_prefix); + } + + CTA_SYNC(); + + // Scan warp counters + + DigitCounterT scan_counters[PADDED_RAKING_SEGMENT]; + + #pragma unroll + for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM) + scan_counters[ITEM] = temp_storage.aliasable.raking_grid[linear_tid][ITEM]; + + BlockScanT(temp_storage.block_scan).ExclusiveSum(scan_counters, scan_counters); + + #pragma unroll + for (int ITEM = 0; ITEM < PADDED_RAKING_SEGMENT; ++ITEM) + temp_storage.aliasable.raking_grid[linear_tid][ITEM] = scan_counters[ITEM]; + + CTA_SYNC(); + + // Seed ranks with counter values from previous warps + #pragma unroll + for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM) + ranks[ITEM] += *digit_counters[ITEM]; + } + + + /** + * \brief Rank keys. For the lower \p RADIX_DIGITS threads, digit counts for each digit are provided for the corresponding thread. + */ + template < + typename UnsignedBits, + int KEYS_PER_THREAD> + __device__ __forceinline__ void RankKeys( + UnsignedBits (&keys)[KEYS_PER_THREAD], ///< [in] Keys for this tile + int (&ranks)[KEYS_PER_THREAD], ///< [out] For each key, the local rank within the tile (out parameter) + int current_bit, ///< [in] The least-significant bit position of the current digit to extract + int num_bits, ///< [in] The number of bits in the current digit + int (&exclusive_digit_prefix)[BINS_TRACKED_PER_THREAD]) ///< [out] The exclusive prefix sum for the digits [(threadIdx.x * BINS_TRACKED_PER_THREAD) ... 
(threadIdx.x * BINS_TRACKED_PER_THREAD) + BINS_TRACKED_PER_THREAD - 1] + { + RankKeys(keys, ranks, current_bit, num_bits); + + // Get exclusive count for each digit + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (linear_tid * BINS_TRACKED_PER_THREAD) + track; + + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + { + if (IS_DESCENDING) + bin_idx = RADIX_DIGITS - bin_idx - 1; + + exclusive_digit_prefix[track] = temp_storage.aliasable.warp_digit_counters[bin_idx][0]; + } + } + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/GraphBLAS/CUDA/local_cub/block/block_radix_sort.cuh b/GraphBLAS/CUDA/local_cub/block/block_radix_sort.cuh new file mode 100644 index 0000000000..ac0c9f85b1 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/block/block_radix_sort.cuh @@ -0,0 +1,863 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockRadixSort class provides [collective](index.html#sec0) methods for radix sorting of items partitioned across a CUDA thread block. + */ + + +#pragma once + +#include "block_exchange.cuh" +#include "block_radix_rank.cuh" +#include "../util_ptx.cuh" +#include "../util_arch.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief The BlockRadixSort class provides [collective](index.html#sec0) methods for sorting items partitioned across a CUDA thread block using a radix sorting method. 
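The RankKeys routine of BlockRadixRankMatch above hinges on a warp-wide match-any vote: lanes holding the same digit find one another, __popc of the peer mask restricted to lower lanes gives each key a stable rank among equal digits, and only the first peer updates the shared per-digit counter. The following is a minimal standalone sketch of that warp-level trick, not CUB code: it uses __match_any_sync (CUDA 9+, sm_70+), whereas CUB's MatchAny also emulates the vote on older architectures, and the kernel name and test data are assumptions.

    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void warp_rank_digit(const unsigned int *keys, int *ranks,
                                    int current_bit, int num_bits)
    {
        unsigned int lane  = threadIdx.x & 31u;
        unsigned int digit = (keys[threadIdx.x] >> current_bit) & ((1u << num_bits) - 1u);

        // Mask of lanes in this warp holding the same digit
        unsigned int peers = __match_any_sync(0xFFFFFFFFu, digit);

        // Peers in lower lanes determine my rank among keys with this digit
        unsigned int lane_mask_lt = (1u << lane) - 1u;
        ranks[threadIdx.x] = __popc(peers & lane_mask_lt);
    }

    int main()
    {
        unsigned int h_keys[32];
        for (int i = 0; i < 32; ++i) h_keys[i] = i % 4;      // digits 0..3, repeated
        unsigned int *d_keys; int *d_ranks; int h_ranks[32];
        cudaMalloc(&d_keys, sizeof(h_keys));
        cudaMalloc(&d_ranks, sizeof(h_ranks));
        cudaMemcpy(d_keys, h_keys, sizeof(h_keys), cudaMemcpyHostToDevice);
        warp_rank_digit<<<1, 32>>>(d_keys, d_ranks, /*current_bit=*/0, /*num_bits=*/2);
        cudaMemcpy(h_ranks, d_ranks, sizeof(h_ranks), cudaMemcpyDeviceToHost);
        for (int i = 0; i < 32; ++i) printf("key %u -> within-digit rank %d\n", h_keys[i], h_ranks[i]);
        cudaFree(d_keys); cudaFree(d_ranks);
        return 0;
    }

In the full RankKeys above, this within-warp rank is then offset by the occurrences counted in earlier strips of the same warp and, after the block-wide exclusive scan of the per-warp digit counters, by earlier warps and smaller digits.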
![](sorting_logo.png) + * \ingroup BlockModule + * + * \tparam KeyT KeyT type + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam ITEMS_PER_THREAD The number of items per thread + * \tparam ValueT [optional] ValueT type (default: cub::NullType, which indicates a keys-only sort) + * \tparam RADIX_BITS [optional] The number of radix bits per digit place (default: 4 bits) + * \tparam MEMOIZE_OUTER_SCAN [optional] Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure (default: true for architectures SM35 and newer, false otherwise). + * \tparam INNER_SCAN_ALGORITHM [optional] The cub::BlockScanAlgorithm algorithm to use (default: cub::BLOCK_SCAN_WARP_SCANS) + * \tparam SMEM_CONFIG [optional] Shared memory bank mode (default: \p cudaSharedMemBankSizeFourByte) + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - The [radix sorting method](http://en.wikipedia.org/wiki/Radix_sort) arranges + * items into ascending order. It relies upon a positional representation for + * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits, + * characters, etc.) specified from least-significant to most-significant. For a + * given input sequence of keys and a set of rules specifying a total ordering + * of the symbolic alphabet, the radix sorting method produces a lexicographic + * ordering of those keys. + * - BlockRadixSort can sort all of the built-in C++ numeric primitive types + * (unsigned char, \p int, \p double, etc.) as well as CUDA's \p __half + * half-precision floating-point type. Within each key, the implementation treats fixed-length + * bit-sequences of \p RADIX_BITS as radix digit places. Although the direct radix sorting + * method can only be applied to unsigned integral types, BlockRadixSort + * is able to sort signed and floating-point types via simple bit-wise transformations + * that ensure lexicographic key ordering. + * - \rowmajor + * + * \par Performance Considerations + * - \granularity + * + * \par A Simple Example + * \blockcollective{BlockRadixSort} + * \par + * The code snippet below illustrates a sort of 512 integer keys that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * ... + * + * // Collectively sort the keys + * BlockRadixSort(temp_storage).Sort(thread_keys); + * + * ... + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The + * corresponding output \p thread_keys in those threads will be + * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. 
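The snippet in the overview above omits the include path and the BlockRadixSort template arguments. A compilable rendering of the same example, assuming one 128-thread block sorting 512 contiguous int keys (the kernel name and the manual blocked load/store loops are additions, not CUB code), could look like this:

    #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>

    __global__ void ExampleSortKernel(int *d_keys)     // d_keys holds 512 ints
    {
        // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer items each
        typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;

        // Allocate shared memory for BlockRadixSort
        __shared__ typename BlockRadixSort::TempStorage temp_storage;

        // Obtain a segment of consecutive items that are blocked across threads
        int thread_keys[4];
        #pragma unroll
        for (int i = 0; i < 4; ++i)
            thread_keys[i] = d_keys[threadIdx.x * 4 + i];

        // Collectively sort the keys
        BlockRadixSort(temp_storage).Sort(thread_keys);

        // Write the sorted keys back in the same blocked arrangement
        #pragma unroll
        for (int i = 0; i < 4; ++i)
            d_keys[threadIdx.x * 4 + i] = thread_keys[i];
    }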
+ * + */ +template < + typename KeyT, + int BLOCK_DIM_X, + int ITEMS_PER_THREAD, + typename ValueT = NullType, + int RADIX_BITS = 4, + bool MEMOIZE_OUTER_SCAN = (CUB_PTX_ARCH >= 350) ? true : false, + BlockScanAlgorithm INNER_SCAN_ALGORITHM = BLOCK_SCAN_WARP_SCANS, + cudaSharedMemConfig SMEM_CONFIG = cudaSharedMemBankSizeFourByte, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockRadixSort +{ +private: + + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + enum + { + // The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + // Whether or not there are values to be trucked along with keys + KEYS_ONLY = Equals::VALUE, + }; + + // KeyT traits and unsigned bits type + typedef Traits KeyTraits; + typedef typename KeyTraits::UnsignedBits UnsignedBits; + + /// Ascending BlockRadixRank utility type + typedef BlockRadixRank< + BLOCK_DIM_X, + RADIX_BITS, + false, + MEMOIZE_OUTER_SCAN, + INNER_SCAN_ALGORITHM, + SMEM_CONFIG, + BLOCK_DIM_Y, + BLOCK_DIM_Z, + PTX_ARCH> + AscendingBlockRadixRank; + + /// Descending BlockRadixRank utility type + typedef BlockRadixRank< + BLOCK_DIM_X, + RADIX_BITS, + true, + MEMOIZE_OUTER_SCAN, + INNER_SCAN_ALGORITHM, + SMEM_CONFIG, + BLOCK_DIM_Y, + BLOCK_DIM_Z, + PTX_ARCH> + DescendingBlockRadixRank; + + /// BlockExchange utility type for keys + typedef BlockExchange BlockExchangeKeys; + + /// BlockExchange utility type for values + typedef BlockExchange BlockExchangeValues; + + /// Shared memory storage layout type + union _TempStorage + { + typename AscendingBlockRadixRank::TempStorage asending_ranking_storage; + typename DescendingBlockRadixRank::TempStorage descending_ranking_storage; + typename BlockExchangeKeys::TempStorage exchange_keys; + typename BlockExchangeValues::TempStorage exchange_values; + }; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + unsigned int linear_tid; + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + /// Rank keys (specialized for ascending sort) + __device__ __forceinline__ void RankKeys( + UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + int begin_bit, + int pass_bits, + Int2Type /*is_descending*/) + { + AscendingBlockRadixRank(temp_storage.asending_ranking_storage).RankKeys( + unsigned_keys, + ranks, + begin_bit, + pass_bits); + } + + /// Rank keys (specialized for descending sort) + __device__ __forceinline__ void RankKeys( + UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + int begin_bit, + int pass_bits, + Int2Type /*is_descending*/) + { + DescendingBlockRadixRank(temp_storage.descending_ranking_storage).RankKeys( + unsigned_keys, + ranks, + begin_bit, + pass_bits); + } + + /// ExchangeValues (specialized for key-value sort, to-blocked arrangement) + __device__ __forceinline__ void ExchangeValues( + ValueT 
(&values)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + Int2Type /*is_keys_only*/, + Int2Type /*is_blocked*/) + { + CTA_SYNC(); + + // Exchange values through shared memory in blocked arrangement + BlockExchangeValues(temp_storage.exchange_values).ScatterToBlocked(values, ranks); + } + + /// ExchangeValues (specialized for key-value sort, to-striped arrangement) + __device__ __forceinline__ void ExchangeValues( + ValueT (&values)[ITEMS_PER_THREAD], + int (&ranks)[ITEMS_PER_THREAD], + Int2Type /*is_keys_only*/, + Int2Type /*is_blocked*/) + { + CTA_SYNC(); + + // Exchange values through shared memory in blocked arrangement + BlockExchangeValues(temp_storage.exchange_values).ScatterToStriped(values, ranks); + } + + /// ExchangeValues (specialized for keys-only sort) + template + __device__ __forceinline__ void ExchangeValues( + ValueT (&/*values*/)[ITEMS_PER_THREAD], + int (&/*ranks*/)[ITEMS_PER_THREAD], + Int2Type /*is_keys_only*/, + Int2Type /*is_blocked*/) + {} + + /// Sort blocked arrangement + template + __device__ __forceinline__ void SortBlocked( + KeyT (&keys)[ITEMS_PER_THREAD], ///< Keys to sort + ValueT (&values)[ITEMS_PER_THREAD], ///< Values to sort + int begin_bit, ///< The beginning (least-significant) bit index needed for key comparison + int end_bit, ///< The past-the-end (most-significant) bit index needed for key comparison + Int2Type is_descending, ///< Tag whether is a descending-order sort + Int2Type is_keys_only) ///< Tag whether is keys-only sort + { + UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] = + reinterpret_cast(keys); + + // Twiddle bits if necessary + #pragma unroll + for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) + { + unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]); + } + + // Radix sorting passes + while (true) + { + int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit); + + // Rank the blocked keys + int ranks[ITEMS_PER_THREAD]; + RankKeys(unsigned_keys, ranks, begin_bit, pass_bits, is_descending); + begin_bit += RADIX_BITS; + + CTA_SYNC(); + + // Exchange keys through shared memory in blocked arrangement + BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks); + + // Exchange values through shared memory in blocked arrangement + ExchangeValues(values, ranks, is_keys_only, Int2Type()); + + // Quit if done + if (begin_bit >= end_bit) break; + + CTA_SYNC(); + } + + // Untwiddle bits if necessary + #pragma unroll + for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) + { + unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]); + } + } + +public: + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + /// Sort blocked -> striped arrangement + template + __device__ __forceinline__ void SortBlockedToStriped( + KeyT (&keys)[ITEMS_PER_THREAD], ///< Keys to sort + ValueT (&values)[ITEMS_PER_THREAD], ///< Values to sort + int begin_bit, ///< The beginning (least-significant) bit index needed for key comparison + int end_bit, ///< The past-the-end (most-significant) bit index needed for key comparison + Int2Type is_descending, ///< Tag whether is a descending-order sort + Int2Type is_keys_only) ///< Tag whether is keys-only sort + { + UnsignedBits (&unsigned_keys)[ITEMS_PER_THREAD] = + reinterpret_cast(keys); + + // Twiddle bits if necessary + #pragma unroll + for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) + { + unsigned_keys[KEY] = KeyTraits::TwiddleIn(unsigned_keys[KEY]); + } + + // Radix sorting passes + while (true) + { + int pass_bits = CUB_MIN(RADIX_BITS, end_bit - begin_bit); + + // Rank the blocked 
keys + int ranks[ITEMS_PER_THREAD]; + RankKeys(unsigned_keys, ranks, begin_bit, pass_bits, is_descending); + begin_bit += RADIX_BITS; + + CTA_SYNC(); + + // Check if this is the last pass + if (begin_bit >= end_bit) + { + // Last pass exchanges keys through shared memory in striped arrangement + BlockExchangeKeys(temp_storage.exchange_keys).ScatterToStriped(keys, ranks); + + // Last pass exchanges through shared memory in striped arrangement + ExchangeValues(values, ranks, is_keys_only, Int2Type()); + + // Quit + break; + } + + // Exchange keys through shared memory in blocked arrangement + BlockExchangeKeys(temp_storage.exchange_keys).ScatterToBlocked(keys, ranks); + + // Exchange values through shared memory in blocked arrangement + ExchangeValues(values, ranks, is_keys_only, Int2Type()); + + CTA_SYNC(); + } + + // Untwiddle bits if necessary + #pragma unroll + for (int KEY = 0; KEY < ITEMS_PER_THREAD; KEY++) + { + unsigned_keys[KEY] = KeyTraits::TwiddleOut(unsigned_keys[KEY]); + } + } + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + /// \smemstorage{BlockRadixSort} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockRadixSort() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. + */ + __device__ __forceinline__ BlockRadixSort( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Sorting (blocked arrangements) + *********************************************************************/ + //@{ + + /** + * \brief Performs an ascending block-wide radix sort over a [blocked arrangement](index.html#sec5sec3) of keys. + * + * \par + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sort of 512 integer keys that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive keys. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * ... + * + * // Collectively sort the keys + * BlockRadixSort(temp_storage).Sort(thread_keys); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. + * The corresponding output \p thread_keys in those threads will be + * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. 
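The Sort() member declared just below also accepts optional begin_bit/end_bit arguments. As a hedged sketch (kernel name and data layout are assumptions), restricting the sort to the low 16 bits skips half of the digit passes when the upper bits of every key are known to be equal:

    #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>

    __global__ void ExampleSortLow16(unsigned int *d_keys)   // 512 keys, upper 16 bits identical
    {
        typedef cub::BlockRadixSort<unsigned int, 128, 4> BlockRadixSort;
        __shared__ typename BlockRadixSort::TempStorage temp_storage;

        unsigned int thread_keys[4];
        #pragma unroll
        for (int i = 0; i < 4; ++i)
            thread_keys[i] = d_keys[threadIdx.x * 4 + i];

        // Only digit places covering bits [0,16) are visited
        BlockRadixSort(temp_storage).Sort(thread_keys, /*begin_bit=*/0, /*end_bit=*/16);

        #pragma unroll
        for (int i = 0; i < 4; ++i)
            d_keys[threadIdx.x * 4 + i] = thread_keys[i];
    }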
+ */ + __device__ __forceinline__ void Sort( + KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + { + NullType values[ITEMS_PER_THREAD]; + + SortBlocked(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); + } + + + /** + * \brief Performs an ascending block-wide radix sort across a [blocked arrangement](index.html#sec5sec3) of keys and values. + * + * \par + * - BlockRadixSort can only accommodate one associated tile of values. To "truck along" + * more than one tile of values, simply perform a key-value sort of the keys paired + * with a temporary value array that enumerates the key indices. The reordered indices + * can then be used as a gather-vector for exchanging other associated tile data through + * shared memory. + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sort of 512 integer keys and values that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive pairs. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * int thread_values[4]; + * ... + * + * // Collectively sort the keys and values among block threads + * BlockRadixSort(temp_storage).Sort(thread_keys, thread_values); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The + * corresponding output \p thread_keys in those threads will be + * { [0,1,2,3], [4,5,6,7], [8,9,10,11], ..., [508,509,510,511] }. + * + */ + __device__ __forceinline__ void Sort( + KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + ValueT (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort + int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + { + SortBlocked(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); + } + + /** + * \brief Performs a descending block-wide radix sort over a [blocked arrangement](index.html#sec5sec3) of keys. + * + * \par + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sort of 512 integer keys that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive keys. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * ... + * + * // Collectively sort the keys + * BlockRadixSort(temp_storage).Sort(thread_keys); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. + * The corresponding output \p thread_keys in those threads will be + * { [511,510,509,508], [11,10,9,8], [7,6,5,4], ..., [3,2,1,0] }. + */ + __device__ __forceinline__ void SortDescending( + KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + { + NullType values[ITEMS_PER_THREAD]; + + SortBlocked(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); + } + + + /** + * \brief Performs a descending block-wide radix sort across a [blocked arrangement](index.html#sec5sec3) of keys and values. + * + * \par + * - BlockRadixSort can only accommodate one associated tile of values. To "truck along" + * more than one tile of values, simply perform a key-value sort of the keys paired + * with a temporary value array that enumerates the key indices. The reordered indices + * can then be used as a gather-vector for exchanging other associated tile data through + * shared memory. + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sort of 512 integer keys and values that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive pairs. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * int thread_values[4]; + * ... + * + * // Collectively sort the keys and values among block threads + * BlockRadixSort(temp_storage).Sort(thread_keys, thread_values); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The + * corresponding output \p thread_keys in those threads will be + * { [511,510,509,508], [11,10,9,8], [7,6,5,4], ..., [3,2,1,0] }. 
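As with the earlier snippets, the key-value example above omits its include and template arguments. A compilable rendering for a descending sort of 512 (int key, int value) pairs follows; the kernel name and the blocked load/store loops are assumptions:

    #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>

    __global__ void ExampleSortPairsDescending(int *d_keys, int *d_values)
    {
        // KeyT = int, 128 threads, 4 items per thread, ValueT = int
        typedef cub::BlockRadixSort<int, 128, 4, int> BlockRadixSort;
        __shared__ typename BlockRadixSort::TempStorage temp_storage;

        int thread_keys[4], thread_values[4];
        #pragma unroll
        for (int i = 0; i < 4; ++i)
        {
            thread_keys[i]   = d_keys[threadIdx.x * 4 + i];
            thread_values[i] = d_values[threadIdx.x * 4 + i];
        }

        // Collectively sort the pairs by key, largest key first
        BlockRadixSort(temp_storage).SortDescending(thread_keys, thread_values);

        #pragma unroll
        for (int i = 0; i < 4; ++i)
        {
            d_keys[threadIdx.x * 4 + i]   = thread_keys[i];
            d_values[threadIdx.x * 4 + i] = thread_values[i];
        }
    }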
+ * + */ + __device__ __forceinline__ void SortDescending( + KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + ValueT (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort + int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + { + SortBlocked(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); + } + + + //@} end member group + /******************************************************************//** + * \name Sorting (blocked arrangement -> striped arrangement) + *********************************************************************/ + //@{ + + + /** + * \brief Performs an ascending radix sort across a [blocked arrangement](index.html#sec5sec3) of keys, leaving them in a [striped arrangement](index.html#sec5sec3). + * + * \par + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sort of 512 integer keys that + * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive keys. The final partitioning is striped. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * ... + * + * // Collectively sort the keys + * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The + * corresponding output \p thread_keys in those threads will be + * { [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }. + * + */ + __device__ __forceinline__ void SortBlockedToStriped( + KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + { + NullType values[ITEMS_PER_THREAD]; + + SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); + } + + + /** + * \brief Performs an ascending radix sort across a [blocked arrangement](index.html#sec5sec3) of keys and values, leaving them in a [striped arrangement](index.html#sec5sec3). + * + * \par + * - BlockRadixSort can only accommodate one associated tile of values. To "truck along" + * more than one tile of values, simply perform a key-value sort of the keys paired + * with a temporary value array that enumerates the key indices. The reordered indices + * can then be used as a gather-vector for exchanging other associated tile data through + * shared memory. + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sort of 512 integer keys and values that + * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive pairs. The final partitioning is striped. 
+ * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * int thread_values[4]; + * ... + * + * // Collectively sort the keys and values among block threads + * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys, thread_values); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The + * corresponding output \p thread_keys in those threads will be + * { [0,128,256,384], [1,129,257,385], [2,130,258,386], ..., [127,255,383,511] }. + * + */ + __device__ __forceinline__ void SortBlockedToStriped( + KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + ValueT (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort + int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + { + SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); + } + + + /** + * \brief Performs a descending radix sort across a [blocked arrangement](index.html#sec5sec3) of keys, leaving them in a [striped arrangement](index.html#sec5sec3). + * + * \par + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sort of 512 integer keys that + * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive keys. The final partitioning is striped. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * ... + * + * // Collectively sort the keys + * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The + * corresponding output \p thread_keys in those threads will be + * { [511,383,255,127], [386,258,130,2], [385,257,128,1], ..., [384,256,128,0] }. 
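The payoff of the blocked-to-striped variants in this member group is that a striped result can be written back with fully coalesced stores: after the call, item i of thread t corresponds to output slot i*BLOCK_THREADS + t. A hedged sketch for the ascending case (kernel name and buffer layout assumed):

    #include <cub/cub.cuh>   // or equivalently <cub/block/block_radix_sort.cuh>

    __global__ void ExampleSortToStriped(const int *d_in, int *d_out)   // 512 keys
    {
        typedef cub::BlockRadixSort<int, 128, 4> BlockRadixSort;
        __shared__ typename BlockRadixSort::TempStorage temp_storage;

        int thread_keys[4];
        #pragma unroll
        for (int i = 0; i < 4; ++i)
            thread_keys[i] = d_in[threadIdx.x * 4 + i];     // blocked arrangement in

        // Sort, leaving the result in a striped arrangement across the block
        BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys);

        // Striped arrangement out: consecutive threads write consecutive addresses
        #pragma unroll
        for (int i = 0; i < 4; ++i)
            d_out[i * 128 + threadIdx.x] = thread_keys[i];
    }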
+ * + */ + __device__ __forceinline__ void SortDescendingBlockedToStriped( + KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + { + NullType values[ITEMS_PER_THREAD]; + + SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); + } + + + /** + * \brief Performs a descending radix sort across a [blocked arrangement](index.html#sec5sec3) of keys and values, leaving them in a [striped arrangement](index.html#sec5sec3). + * + * \par + * - BlockRadixSort can only accommodate one associated tile of values. To "truck along" + * more than one tile of values, simply perform a key-value sort of the keys paired + * with a temporary value array that enumerates the key indices. The reordered indices + * can then be used as a gather-vector for exchanging other associated tile data through + * shared memory. + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sort of 512 integer keys and values that + * are initially partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive pairs. The final partitioning is striped. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockRadixSort for a 1D block of 128 threads owning 4 integer keys and values each + * typedef cub::BlockRadixSort BlockRadixSort; + * + * // Allocate shared memory for BlockRadixSort + * __shared__ typename BlockRadixSort::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_keys[4]; + * int thread_values[4]; + * ... + * + * // Collectively sort the keys and values among block threads + * BlockRadixSort(temp_storage).SortBlockedToStriped(thread_keys, thread_values); + * + * \endcode + * \par + * Suppose the set of input \p thread_keys across the block of threads is + * { [0,511,1,510], [2,509,3,508], [4,507,5,506], ..., [254,257,255,256] }. The + * corresponding output \p thread_keys in those threads will be + * { [511,383,255,127], [386,258,130,2], [385,257,128,1], ..., [384,256,128,0] }. + * + */ + __device__ __forceinline__ void SortDescendingBlockedToStriped( + KeyT (&keys)[ITEMS_PER_THREAD], ///< [in-out] Keys to sort + ValueT (&values)[ITEMS_PER_THREAD], ///< [in-out] Values to sort + int begin_bit = 0, ///< [in] [optional] The beginning (least-significant) bit index needed for key comparison + int end_bit = sizeof(KeyT) * 8) ///< [in] [optional] The past-the-end (most-significant) bit index needed for key comparison + { + SortBlockedToStriped(keys, values, begin_bit, end_bit, Int2Type(), Int2Type()); + } + + + //@} end member group + +}; + +/** + * \example example_block_radix_sort.cu + */ + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/block/block_raking_layout.cuh b/GraphBLAS/CUDA/local_cub/block/block_raking_layout.cuh new file mode 100644 index 0000000000..3500616863 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/block/block_raking_layout.cuh @@ -0,0 +1,152 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockRakingLayout provides a conflict-free shared memory layout abstraction for warp-raking across thread block data. + */ + + +#pragma once + +#include "../util_macro.cuh" +#include "../util_arch.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief BlockRakingLayout provides a conflict-free shared memory layout abstraction for 1D raking across thread block data. ![](raking.png) + * \ingroup BlockModule + * + * \par Overview + * This type facilitates a shared memory usage pattern where a block of CUDA + * threads places elements into shared memory and then reduces the active + * parallelism to one "raking" warp of threads for serially aggregating consecutive + * sequences of shared items. Padding is inserted to eliminate bank conflicts + * (for most data types). + * + * \tparam T The data type to be exchanged. + * \tparam BLOCK_THREADS The thread block size in threads. 
+ * \tparam PTX_ARCH [optional] \ptxversion + */ +template < + typename T, + int BLOCK_THREADS, + int PTX_ARCH = CUB_PTX_ARCH> +struct BlockRakingLayout +{ + //--------------------------------------------------------------------- + // Constants and type definitions + //--------------------------------------------------------------------- + + enum + { + /// The total number of elements that need to be cooperatively reduced + SHARED_ELEMENTS = BLOCK_THREADS, + + /// Maximum number of warp-synchronous raking threads + MAX_RAKING_THREADS = CUB_MIN(BLOCK_THREADS, CUB_WARP_THREADS(PTX_ARCH)), + + /// Number of raking elements per warp-synchronous raking thread (rounded up) + SEGMENT_LENGTH = (SHARED_ELEMENTS + MAX_RAKING_THREADS - 1) / MAX_RAKING_THREADS, + + /// Never use a raking thread that will have no valid data (e.g., when BLOCK_THREADS is 62 and SEGMENT_LENGTH is 2, we should only use 31 raking threads) + RAKING_THREADS = (SHARED_ELEMENTS + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH, + + /// Whether we will have bank conflicts (technically we should find out if the GCD is > 1) + HAS_CONFLICTS = (CUB_SMEM_BANKS(PTX_ARCH) % SEGMENT_LENGTH == 0), + + /// Degree of bank conflicts (e.g., 4-way) + CONFLICT_DEGREE = (HAS_CONFLICTS) ? + (MAX_RAKING_THREADS * SEGMENT_LENGTH) / CUB_SMEM_BANKS(PTX_ARCH) : + 1, + + /// Pad each segment length with one element if segment length is not relatively prime to warp size and can't be optimized as a vector load + USE_SEGMENT_PADDING = ((SEGMENT_LENGTH & 1) == 0) && (SEGMENT_LENGTH > 2), + + /// Total number of elements in the raking grid + GRID_ELEMENTS = RAKING_THREADS * (SEGMENT_LENGTH + USE_SEGMENT_PADDING), + + /// Whether or not we need bounds checking during raking (the number of reduction elements is not a multiple of the number of raking threads) + UNGUARDED = (SHARED_ELEMENTS % RAKING_THREADS == 0), + }; + + + /** + * \brief Shared memory storage type + */ + struct __align__(16) _TempStorage + { + T buff[BlockRakingLayout::GRID_ELEMENTS]; + }; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /** + * \brief Returns the location for the calling thread to place data into the grid + */ + static __device__ __forceinline__ T* PlacementPtr( + TempStorage &temp_storage, + unsigned int linear_tid) + { + // Offset for partial + unsigned int offset = linear_tid; + + // Add in one padding element for every segment + if (USE_SEGMENT_PADDING > 0) + { + offset += offset / SEGMENT_LENGTH; + } + + // Incorporating a block of padding partials every shared memory segment + return temp_storage.Alias().buff + offset; + } + + + /** + * \brief Returns the location for the calling thread to begin sequential raking + */ + static __device__ __forceinline__ T* RakingPtr( + TempStorage &temp_storage, + unsigned int linear_tid) + { + return temp_storage.Alias().buff + (linear_tid * (SEGMENT_LENGTH + USE_SEGMENT_PADDING)); + } +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/block/block_reduce.cuh b/GraphBLAS/CUDA/local_cub/block/block_reduce.cuh new file mode 100644 index 0000000000..261f2ea6f5 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/block/block_reduce.cuh @@ -0,0 +1,607 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block. + */ + +#pragma once + +#include "specializations/block_reduce_raking.cuh" +#include "specializations/block_reduce_raking_commutative_only.cuh" +#include "specializations/block_reduce_warp_reductions.cuh" +#include "../util_ptx.cuh" +#include "../util_type.cuh" +#include "../thread/thread_operators.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + + +/****************************************************************************** + * Algorithmic variants + ******************************************************************************/ + +/** + * BlockReduceAlgorithm enumerates alternative algorithms for parallel + * reduction across a CUDA thread block. + */ +enum BlockReduceAlgorithm +{ + + /** + * \par Overview + * An efficient "raking" reduction algorithm that only supports commutative + * reduction operators (true for most operations, e.g., addition). + * + * \par + * Execution is comprised of three phases: + * -# Upsweep sequential reduction in registers (if threads contribute more + * than one input each). Threads in warps other than the first warp place + * their partial reductions into shared memory. + * -# Upsweep sequential reduction in shared memory. Threads within the first + * warp continue to accumulate by raking across segments of shared partial reductions + * -# A warp-synchronous Kogge-Stone style reduction within the raking warp. + * + * \par + * \image html block_reduce.png + *
+ * \p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.
+ * + * \par Performance Considerations + * - This variant performs less communication than BLOCK_REDUCE_RAKING_NON_COMMUTATIVE + * and is preferable when the reduction operator is commutative. This variant + * applies fewer reduction operators than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall + * throughput across the GPU when suitably occupied. However, turn-around latency may be + * higher than to BLOCK_REDUCE_WARP_REDUCTIONS and thus less-desirable + * when the GPU is under-occupied. + */ + BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY, + + + /** + * \par Overview + * An efficient "raking" reduction algorithm that supports commutative + * (e.g., addition) and non-commutative (e.g., string concatenation) reduction + * operators. \blocked. + * + * \par + * Execution is comprised of three phases: + * -# Upsweep sequential reduction in registers (if threads contribute more + * than one input each). Each thread then places the partial reduction + * of its item(s) into shared memory. + * -# Upsweep sequential reduction in shared memory. Threads within a + * single warp rake across segments of shared partial reductions. + * -# A warp-synchronous Kogge-Stone style reduction within the raking warp. + * + * \par + * \image html block_reduce.png + *
+ * \p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.
+ * + * \par Performance Considerations + * - This variant performs more communication than BLOCK_REDUCE_RAKING + * and is only preferable when the reduction operator is non-commutative. This variant + * applies fewer reduction operators than BLOCK_REDUCE_WARP_REDUCTIONS, and can provide higher overall + * throughput across the GPU when suitably occupied. However, turn-around latency may be + * higher than to BLOCK_REDUCE_WARP_REDUCTIONS and thus less-desirable + * when the GPU is under-occupied. + */ + BLOCK_REDUCE_RAKING, + + + /** + * \par Overview + * A quick "tiled warp-reductions" reduction algorithm that supports commutative + * (e.g., addition) and non-commutative (e.g., string concatenation) reduction + * operators. + * + * \par + * Execution is comprised of four phases: + * -# Upsweep sequential reduction in registers (if threads contribute more + * than one input each). Each thread then places the partial reduction + * of its item(s) into shared memory. + * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style + * reduction within each warp. + * -# A propagation phase where the warp reduction outputs in each warp are + * updated with the aggregate from each preceding warp. + * + * \par + * \image html block_scan_warpscans.png + *
+ * \p BLOCK_REDUCE_WARP_REDUCTIONS data flow for a hypothetical 16-thread thread block and 4-thread raking warp.
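The three enumerators above are chosen through BlockReduce's ALGORITHM template parameter. A hedged sketch of a block-sum kernel that pins the variant explicitly (the kernel name and the one-block-per-128-items layout are assumptions); swapping in cub::BLOCK_REDUCE_RAKING or cub::BLOCK_REDUCE_WARP_REDUCTIONS changes only the latency/throughput trade-off, not the result:

    #include <cub/cub.cuh>   // or equivalently <cub/block/block_reduce.cuh>

    __global__ void ExampleBlockSum(const int *d_in, int *d_block_sums)
    {
        // Sum is commutative, so the cheapest raking variant is admissible
        typedef cub::BlockReduce<int, 128, cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY> BlockReduceT;
        __shared__ typename BlockReduceT::TempStorage temp_storage;

        int thread_data = d_in[blockIdx.x * 128 + threadIdx.x];

        // Block-wide sum; the return value is only defined for thread0
        int block_sum = BlockReduceT(temp_storage).Sum(thread_data);
        if (threadIdx.x == 0)
            d_block_sums[blockIdx.x] = block_sum;
    }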
+ * + * \par Performance Considerations + * - This variant applies more reduction operators than BLOCK_REDUCE_RAKING + * or BLOCK_REDUCE_RAKING_NON_COMMUTATIVE, which may result in lower overall + * throughput across the GPU. However turn-around latency may be lower and + * thus useful when the GPU is under-occupied. + */ + BLOCK_REDUCE_WARP_REDUCTIONS, +}; + + +/****************************************************************************** + * Block reduce + ******************************************************************************/ + +/** + * \brief The BlockReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread block. ![](reduce_logo.png) + * \ingroup BlockModule + * + * \tparam T Data type being reduced + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam ALGORITHM [optional] cub::BlockReduceAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_REDUCE_WARP_REDUCTIONS) + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - A reduction (or fold) + * uses a binary combining operator to compute a single aggregate from a list of input elements. + * - \rowmajor + * - BlockReduce can be optionally specialized by algorithm to accommodate different latency/throughput workload profiles: + * -# cub::BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY. An efficient "raking" reduction algorithm that only supports commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm) + * -# cub::BLOCK_REDUCE_RAKING. An efficient "raking" reduction algorithm that supports commutative and non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm) + * -# cub::BLOCK_REDUCE_WARP_REDUCTIONS. A quick "tiled warp-reductions" reduction algorithm that supports commutative and non-commutative reduction operators. [More...](\ref cub::BlockReduceAlgorithm) + * + * \par Performance Considerations + * - \granularity + * - Very efficient (only one synchronization barrier). + * - Incurs zero bank conflicts for most types + * - Computation is slightly more efficient (i.e., having lower instruction overhead) for: + * - Summation (vs. generic reduction) + * - \p BLOCK_THREADS is a multiple of the architecture's warp size + * - Every thread has a valid input (i.e., full vs. partial-tiles) + * - See cub::BlockReduceAlgorithm for performance details regarding algorithmic alternatives + * + * \par A Simple Example + * \blockcollective{BlockReduce} + * \par + * The code snippet below illustrates a sum reduction of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockReduce for a 1D block of 128 threads on type int + * typedef cub::BlockReduce BlockReduce; + * + * // Allocate shared memory for BlockReduce + * __shared__ typename BlockReduce::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... 
+ * + * // Compute the block-wide sum for thread0 + * int aggregate = BlockReduce(temp_storage).Sum(thread_data); + * + * \endcode + * + */ +template < + typename T, + int BLOCK_DIM_X, + BlockReduceAlgorithm ALGORITHM = BLOCK_REDUCE_WARP_REDUCTIONS, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockReduce +{ +private: + + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + typedef BlockReduceWarpReductions WarpReductions; + typedef BlockReduceRakingCommutativeOnly RakingCommutativeOnly; + typedef BlockReduceRaking Raking; + + /// Internal specialization type + typedef typename If<(ALGORITHM == BLOCK_REDUCE_WARP_REDUCTIONS), + WarpReductions, + typename If<(ALGORITHM == BLOCK_REDUCE_RAKING_COMMUTATIVE_ONLY), + RakingCommutativeOnly, + Raking>::Type>::Type InternalBlockReduce; // BlockReduceRaking + + /// Shared memory storage layout type for BlockReduce + typedef typename InternalBlockReduce::TempStorage _TempStorage; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + unsigned int linear_tid; + + +public: + + /// \smemstorage{BlockReduce} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockReduce() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. + */ + __device__ __forceinline__ BlockReduce( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Generic reductions + *********************************************************************/ + //@{ + + + /** + * \brief Computes a block-wide reduction for thread0 using the specified binary reduction functor. Each thread contributes one input element. + * + * \par + * - The return value is undefined in threads other than thread0. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a max reduction of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize BlockReduce for a 1D block of 128 threads on type int + * typedef cub::BlockReduce BlockReduce; + * + * // Allocate shared memory for BlockReduce + * __shared__ typename BlockReduce::TempStorage temp_storage; + * + * // Each thread obtains an input item + * int thread_data; + * ... + * + * // Compute the block-wide max for thread0 + * int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max()); + * + * \endcode + * + * \tparam ReductionOp [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ T Reduce( + T input, ///< [in] Calling thread's input + ReductionOp reduction_op) ///< [in] Binary reduction functor + { + return InternalBlockReduce(temp_storage).template Reduce(input, BLOCK_THREADS, reduction_op); + } + + + /** + * \brief Computes a block-wide reduction for thread0 using the specified binary reduction functor. Each thread contributes an array of consecutive input elements. + * + * \par + * - The return value is undefined in threads other than thread0. + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a max reduction of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockReduce for a 1D block of 128 threads on type int + * typedef cub::BlockReduce BlockReduce; + * + * // Allocate shared memory for BlockReduce + * __shared__ typename BlockReduce::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Compute the block-wide max for thread0 + * int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max()); + * + * \endcode + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam ReductionOp [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) + */ + template < + int ITEMS_PER_THREAD, + typename ReductionOp> + __device__ __forceinline__ T Reduce( + T (&inputs)[ITEMS_PER_THREAD], ///< [in] Calling thread's input segment + ReductionOp reduction_op) ///< [in] Binary reduction functor + { + // Reduce partials + T partial = internal::ThreadReduce(inputs, reduction_op); + return Reduce(partial, reduction_op); + } + + + /** + * \brief Computes a block-wide reduction for thread0 using the specified binary reduction functor. The first \p num_valid threads each contribute one input element. + * + * \par + * - The return value is undefined in threads other than thread0. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a max reduction of a partially-full tile of integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int num_valid, ...) + * { + * // Specialize BlockReduce for a 1D block of 128 threads on type int + * typedef cub::BlockReduce BlockReduce; + * + * // Allocate shared memory for BlockReduce + * __shared__ typename BlockReduce::TempStorage temp_storage; + * + * // Each thread obtains an input item + * int thread_data; + * if (threadIdx.x < num_valid) thread_data = ... 
+ * + * // Compute the block-wide max for thread0 + * int aggregate = BlockReduce(temp_storage).Reduce(thread_data, cub::Max(), num_valid); + * + * \endcode + * + * \tparam ReductionOp [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ T Reduce( + T input, ///< [in] Calling thread's input + ReductionOp reduction_op, ///< [in] Binary reduction functor + int num_valid) ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS) + { + // Determine if we scan skip bounds checking + if (num_valid >= BLOCK_THREADS) + { + return InternalBlockReduce(temp_storage).template Reduce(input, num_valid, reduction_op); + } + else + { + return InternalBlockReduce(temp_storage).template Reduce(input, num_valid, reduction_op); + } + } + + + //@} end member group + /******************************************************************//** + * \name Summation reductions + *********************************************************************/ + //@{ + + + /** + * \brief Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. Each thread contributes one input element. + * + * \par + * - The return value is undefined in threads other than thread0. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sum reduction of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockReduce for a 1D block of 128 threads on type int + * typedef cub::BlockReduce BlockReduce; + * + * // Allocate shared memory for BlockReduce + * __shared__ typename BlockReduce::TempStorage temp_storage; + * + * // Each thread obtains an input item + * int thread_data; + * ... + * + * // Compute the block-wide sum for thread0 + * int aggregate = BlockReduce(temp_storage).Sum(thread_data); + * + * \endcode + * + */ + __device__ __forceinline__ T Sum( + T input) ///< [in] Calling thread's input + { + return InternalBlockReduce(temp_storage).template Sum(input, BLOCK_THREADS); + } + + /** + * \brief Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. Each thread contributes an array of consecutive input elements. + * + * \par + * - The return value is undefined in threads other than thread0. + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sum reduction of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockReduce for a 1D block of 128 threads on type int + * typedef cub::BlockReduce BlockReduce; + * + * // Allocate shared memory for BlockReduce + * __shared__ typename BlockReduce::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Compute the block-wide sum for thread0 + * int aggregate = BlockReduce(temp_storage).Sum(thread_data); + * + * \endcode + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
+ */ + template + __device__ __forceinline__ T Sum( + T (&inputs)[ITEMS_PER_THREAD]) ///< [in] Calling thread's input segment + { + // Reduce partials + T partial = internal::ThreadReduce(inputs, cub::Sum()); + return Sum(partial); + } + + + /** + * \brief Computes a block-wide reduction for thread0 using addition (+) as the reduction operator. The first \p num_valid threads each contribute one input element. + * + * \par + * - The return value is undefined in threads other than thread0. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sum reduction of a partially-full tile of integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int num_valid, ...) + * { + * // Specialize BlockReduce for a 1D block of 128 threads on type int + * typedef cub::BlockReduce BlockReduce; + * + * // Allocate shared memory for BlockReduce + * __shared__ typename BlockReduce::TempStorage temp_storage; + * + * // Each thread obtains an input item (up to num_items) + * int thread_data; + * if (threadIdx.x < num_valid) + * thread_data = ... + * + * // Compute the block-wide sum for thread0 + * int aggregate = BlockReduce(temp_storage).Sum(thread_data, num_valid); + * + * \endcode + * + */ + __device__ __forceinline__ T Sum( + T input, ///< [in] Calling thread's input + int num_valid) ///< [in] Number of threads containing valid elements (may be less than BLOCK_THREADS) + { + // Determine if we scan skip bounds checking + if (num_valid >= BLOCK_THREADS) + { + return InternalBlockReduce(temp_storage).template Sum(input, num_valid); + } + else + { + return InternalBlockReduce(temp_storage).template Sum(input, num_valid); + } + } + + + //@} end member group +}; + +/** + * \example example_block_reduce.cu + */ + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/block/block_scan.cuh b/GraphBLAS/CUDA/local_cub/block/block_scan.cuh new file mode 100644 index 0000000000..27ea7ed409 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/block/block_scan.cuh @@ -0,0 +1,2126 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockScan class provides [collective](index.html#sec0) methods for computing a parallel prefix sum/scan of items partitioned across a CUDA thread block. + */ + +#pragma once + +#include "specializations/block_scan_raking.cuh" +#include "specializations/block_scan_warp_scans.cuh" +#include "../util_arch.cuh" +#include "../util_type.cuh" +#include "../util_ptx.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Algorithmic variants + ******************************************************************************/ + +/** + * \brief BlockScanAlgorithm enumerates alternative algorithms for cub::BlockScan to compute a parallel prefix scan across a CUDA thread block. + */ +enum BlockScanAlgorithm +{ + + /** + * \par Overview + * An efficient "raking reduce-then-scan" prefix scan algorithm. Execution is comprised of five phases: + * -# Upsweep sequential reduction in registers (if threads contribute more than one input each). Each thread then places the partial reduction of its item(s) into shared memory. + * -# Upsweep sequential reduction in shared memory. Threads within a single warp rake across segments of shared partial reductions. + * -# A warp-synchronous Kogge-Stone style exclusive scan within the raking warp. + * -# Downsweep sequential exclusive scan in shared memory. Threads within a single warp rake across segments of shared partial reductions, seeded with the warp-scan output. + * -# Downsweep sequential scan in registers (if threads contribute more than one input), seeded with the raking scan output. + * + * \par + * \image html block_scan_raking.png + *
<div class="centercaption">\p BLOCK_SCAN_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.</div>
+ * + * \par Performance Considerations + * - Although this variant may suffer longer turnaround latencies when the + * GPU is under-occupied, it can often provide higher overall throughput + * across the GPU when suitably occupied. + */ + BLOCK_SCAN_RAKING, + + + /** + * \par Overview + * Similar to cub::BLOCK_SCAN_RAKING, but with fewer shared memory reads at + * the expense of higher register pressure. Raking threads preserve their + * "upsweep" segment of values in registers while performing warp-synchronous + * scan, allowing the "downsweep" not to re-read them from shared memory. + */ + BLOCK_SCAN_RAKING_MEMOIZE, + + + /** + * \par Overview + * A quick "tiled warpscans" prefix scan algorithm. Execution is comprised of four phases: + * -# Upsweep sequential reduction in registers (if threads contribute more than one input each). Each thread then places the partial reduction of its item(s) into shared memory. + * -# Compute a shallow, but inefficient warp-synchronous Kogge-Stone style scan within each warp. + * -# A propagation phase where the warp scan outputs in each warp are updated with the aggregate from each preceding warp. + * -# Downsweep sequential scan in registers (if threads contribute more than one input), seeded with the raking scan output. + * + * \par + * \image html block_scan_warpscans.png + *
<div class="centercaption">\p BLOCK_SCAN_WARP_SCANS data flow for a hypothetical 16-thread thread block and 4-thread raking warp.</div>
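+ * \par
+ * A brief selection sketch (assuming the \p ALGORITHM template parameter of
+ * cub::BlockScan documented below). Note that BlockScan silently falls back
+ * to BLOCK_SCAN_RAKING when the thread block size is not a multiple of the
+ * architectural warp size:
+ * \code
+ * // Explicitly request the tiled-warpscans variant for 128 threads on int
+ * typedef cub::BlockScan<int, 128, cub::BLOCK_SCAN_WARP_SCANS> BlockScan;
+ * \endcode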
+ * + * \par Performance Considerations + * - Although this variant may suffer lower overall throughput across the + * GPU because due to a heavy reliance on inefficient warpscans, it can + * often provide lower turnaround latencies when the GPU is under-occupied. + */ + BLOCK_SCAN_WARP_SCANS, +}; + + +/****************************************************************************** + * Block scan + ******************************************************************************/ + +/** + * \brief The BlockScan class provides [collective](index.html#sec0) methods for computing a parallel prefix sum/scan of items partitioned across a CUDA thread block. ![](block_scan_logo.png) + * \ingroup BlockModule + * + * \tparam T Data type being scanned + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam ALGORITHM [optional] cub::BlockScanAlgorithm enumerator specifying the underlying algorithm to use (default: cub::BLOCK_SCAN_RAKING) + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - Given a list of input elements and a binary reduction operator, a [prefix scan](http://en.wikipedia.org/wiki/Prefix_sum) + * produces an output list where each element is computed to be the reduction + * of the elements occurring earlier in the input list. Prefix sum + * connotes a prefix scan with the addition operator. The term \em inclusive indicates + * that the ith output reduction incorporates the ith input. + * The term \em exclusive indicates the ith input is not incorporated into + * the ith output reduction. + * - \rowmajor + * - BlockScan can be optionally specialized by algorithm to accommodate different workload profiles: + * -# cub::BLOCK_SCAN_RAKING. An efficient (high throughput) "raking reduce-then-scan" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm) + * -# cub::BLOCK_SCAN_RAKING_MEMOIZE. Similar to cub::BLOCK_SCAN_RAKING, but having higher throughput at the expense of additional register pressure for intermediate storage. [More...](\ref cub::BlockScanAlgorithm) + * -# cub::BLOCK_SCAN_WARP_SCANS. A quick (low latency) "tiled warpscans" prefix scan algorithm. [More...](\ref cub::BlockScanAlgorithm) + * + * \par Performance Considerations + * - \granularity + * - Uses special instructions when applicable (e.g., warp \p SHFL) + * - Uses synchronization-free communication between warp lanes when applicable + * - Invokes a minimal number of minimal block-wide synchronization barriers (only + * one or two depending on algorithm selection) + * - Incurs zero bank conflicts for most types + * - Computation is slightly more efficient (i.e., having lower instruction overhead) for: + * - Prefix sum variants (vs. generic scan) + * - \blocksize + * - See cub::BlockScanAlgorithm for performance details regarding algorithmic alternatives + * + * \par A Simple Example + * \blockcollective{BlockScan} + * \par + * The code snippet below illustrates an exclusive prefix sum of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide exclusive prefix sum + * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * {[1,1,1,1], [1,1,1,1], ..., [1,1,1,1]}. + * The corresponding output \p thread_data in those threads will be + * {[0,1,2,3], [4,5,6,7], ..., [508,509,510,511]}. + * + */ +template < + typename T, + int BLOCK_DIM_X, + BlockScanAlgorithm ALGORITHM = BLOCK_SCAN_RAKING, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockScan +{ +private: + + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + /** + * Ensure the template parameterization meets the requirements of the + * specified algorithm. Currently, the BLOCK_SCAN_WARP_SCANS policy + * cannot be used with thread block sizes not a multiple of the + * architectural warp size. + */ + static const BlockScanAlgorithm SAFE_ALGORITHM = + ((ALGORITHM == BLOCK_SCAN_WARP_SCANS) && (BLOCK_THREADS % CUB_WARP_THREADS(PTX_ARCH) != 0)) ? + BLOCK_SCAN_RAKING : + ALGORITHM; + + typedef BlockScanWarpScans WarpScans; + typedef BlockScanRaking Raking; + + /// Define the delegate type for the desired algorithm + typedef typename If<(SAFE_ALGORITHM == BLOCK_SCAN_WARP_SCANS), + WarpScans, + Raking>::Type InternalBlockScan; + + /// Shared memory storage layout type for BlockScan + typedef typename InternalBlockScan::TempStorage _TempStorage; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + unsigned int linear_tid; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /****************************************************************************** + * Public types + ******************************************************************************/ +public: + + /// \smemstorage{BlockScan} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. 
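+     * \par
+     * A minimal sketch contrasting the two construction styles (assuming the
+     * cub::BlockScan<int, 128> specialization used throughout these snippets,
+     * inside a kernel body):
+     * \code
+     * typedef cub::BlockScan<int, 128> BlockScan;
+     *
+     * // (1) Default constructor: the collective allocates its own private
+     * //     shared-memory storage
+     * int a = threadIdx.x;
+     * BlockScan().ExclusiveSum(a, a);
+     *
+     * // (2) Explicit constructor: the caller supplies TempStorage, which can
+     * //     be reused (or placed in a union) with other shared allocations
+     * __shared__ typename BlockScan::TempStorage temp_storage;
+     * int b = threadIdx.x;
+     * BlockScan(temp_storage).ExclusiveSum(b, b);
+     * \endcode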
+ */ + __device__ __forceinline__ BlockScan() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. + */ + __device__ __forceinline__ BlockScan( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + + //@} end member group + /******************************************************************//** + * \name Exclusive prefix sum operations + *********************************************************************/ + //@{ + + + /** + * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. The value of 0 is applied as the initial value, and is assigned to \p output in thread0. + * + * \par + * - \identityzero + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an exclusive prefix sum of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... + * + * // Collectively compute the block-wide exclusive prefix sum + * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The + * corresponding output \p thread_data in those threads will be 0, 1, ..., 127. + * + */ + __device__ __forceinline__ void ExclusiveSum( + T input, ///< [in] Calling thread's input item + T &output) ///< [out] Calling thread's output item (may be aliased to \p input) + { + T initial_value = 0; + ExclusiveScan(input, output, initial_value, cub::Sum()); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. The value of 0 is applied as the initial value, and is assigned to \p output in thread0. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - \identityzero + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an exclusive prefix sum of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... + * + * // Collectively compute the block-wide exclusive prefix sum + * int block_aggregate; + * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The + * corresponding output \p thread_data in those threads will be 0, 1, ..., 127. 
+ * Furthermore the value \p 128 will be stored in \p block_aggregate for all threads. + * + */ + __device__ __forceinline__ void ExclusiveSum( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + T initial_value = 0; + ExclusiveScan(input, output, initial_value, cub::Sum(), block_aggregate); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - \identityzero + * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a single thread block that progressively + * computes an exclusive prefix sum over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 128 integer items that are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. + * struct BlockPrefixCallbackOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. + * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total += block_aggregate; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) + * { + * // Specialize BlockScan for a 1D block of 128 threads + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Initialize running total + * BlockPrefixCallbackOp prefix_op(0); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data = d_data[block_offset]; + * + * // Collectively compute the block-wide exclusive prefix sum + * BlockScan(temp_storage).ExclusiveSum( + * thread_data, thread_data, prefix_op); + * CTA_SYNC(); + * + * // Store scanned items to output segment + * d_data[block_offset] = thread_data; + * } + * \endcode + * \par + * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... + * The corresponding output for the first segment will be 0, 1, ..., 127. 
+ * The output for the second segment will be 128, 129, ..., 255. + * + * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + */ + template + __device__ __forceinline__ void ExclusiveSum( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. + { + ExclusiveScan(input, output, cub::Sum(), block_prefix_callback_op); + } + + + //@} end member group + /******************************************************************//** + * \name Exclusive prefix sum operations (multiple data per thread) + *********************************************************************/ + //@{ + + + /** + * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. The value of 0 is applied as the initial value, and is assigned to \p output[0] in thread0. + * + * \par + * - \identityzero + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an exclusive prefix sum of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide exclusive prefix sum + * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The + * corresponding output \p thread_data in those threads will be { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + */ + template + __device__ __forceinline__ void ExclusiveSum( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD]) ///< [out] Calling thread's output items (may be aliased to \p input) + { + T initial_value = 0; + ExclusiveScan(input, output, initial_value, cub::Sum()); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. The value of 0 is applied as the initial value, and is assigned to \p output[0] in thread0. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - \identityzero + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an exclusive prefix sum of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide exclusive prefix sum + * int block_aggregate; + * BlockScan(temp_storage).ExclusiveSum(thread_data, thread_data, block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The + * corresponding output \p thread_data in those threads will be { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. + * Furthermore the value \p 512 will be stored in \p block_aggregate for all threads. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + */ + template + __device__ __forceinline__ void ExclusiveSum( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + // Reduce consecutive thread items in registers + T initial_value = 0; + ExclusiveScan(input, output, initial_value, cub::Sum(), block_aggregate); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - \identityzero + * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a single thread block that progressively + * computes an exclusive prefix sum over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 512 integer items that are partitioned in a [blocked arrangement](index.html#sec5sec3) + * across 128 threads where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. + * struct BlockPrefixCallbackOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. 
+ * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total += block_aggregate; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) + * { + * // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread + * typedef cub::BlockLoad BlockLoad; + * typedef cub::BlockStore BlockStore; + * typedef cub::BlockScan BlockScan; + * + * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan + * __shared__ union { + * typename BlockLoad::TempStorage load; + * typename BlockScan::TempStorage scan; + * typename BlockStore::TempStorage store; + * } temp_storage; + * + * // Initialize running total + * BlockPrefixCallbackOp prefix_op(0); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); + * CTA_SYNC(); + * + * // Collectively compute the block-wide exclusive prefix sum + * int block_aggregate; + * BlockScan(temp_storage.scan).ExclusiveSum( + * thread_data, thread_data, prefix_op); + * CTA_SYNC(); + * + * // Store scanned items to output segment + * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); + * CTA_SYNC(); + * } + * \endcode + * \par + * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... + * The corresponding output for the first segment will be 0, 1, 2, 3, ..., 510, 511. + * The output for the second segment will be 512, 513, 514, 515, ..., 1022, 1023. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + */ + template < + int ITEMS_PER_THREAD, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveSum( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. + { + ExclusiveScan(input, output, cub::Sum(), block_prefix_callback_op); + } + + + + //@} end member group // Exclusive prefix sums + /******************************************************************//** + * \name Exclusive prefix scan operations + *********************************************************************/ + //@{ + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. + * + * \par + * - Supports non-commutative scan operators. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an exclusive prefix max scan of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... 
+ * + * // Collectively compute the block-wide exclusive prefix max scan + * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The + * corresponding output \p thread_data in those threads will be INT_MIN, 0, 0, 2, ..., 124, 126. + * + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T initial_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in thread0) + ScanOp scan_op) ///< [in] Binary scan functor + { + InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - Supports non-commutative scan operators. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an exclusive prefix max scan of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... + * + * // Collectively compute the block-wide exclusive prefix max scan + * int block_aggregate; + * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The + * corresponding output \p thread_data in those threads will be INT_MIN, 0, 0, 2, ..., 124, 126. + * Furthermore the value \p 126 will be stored in \p block_aggregate for all threads. + * + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &output, ///< [out] Calling thread's output items (may be aliased to \p input) + T initial_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in thread0) + ScanOp scan_op, ///< [in] Binary scan functor + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + InternalBlockScan(temp_storage).ExclusiveScan(input, output, initial_value, scan_op, block_aggregate); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
+ * + * \par + * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. + * - Supports non-commutative scan operators. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a single thread block that progressively + * computes an exclusive prefix max scan over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 128 integer items that are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. + * struct BlockPrefixCallbackOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. + * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) + * { + * // Specialize BlockScan for a 1D block of 128 threads + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Initialize running total + * BlockPrefixCallbackOp prefix_op(INT_MIN); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data = d_data[block_offset]; + * + * // Collectively compute the block-wide exclusive prefix max scan + * BlockScan(temp_storage).ExclusiveScan( + * thread_data, thread_data, INT_MIN, cub::Max(), prefix_op); + * CTA_SYNC(); + * + * // Store scanned items to output segment + * d_data[block_offset] = thread_data; + * } + * \endcode + * \par + * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... + * The corresponding output for the first segment will be INT_MIN, 0, 0, 2, ..., 124, 126. + * The output for the second segment will be 126, 128, 128, 130, ..., 252, 254. + * + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + */ + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan functor + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. 
+ { + InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_prefix_callback_op); + } + + + //@} end member group // Inclusive prefix sums + /******************************************************************//** + * \name Exclusive prefix scan operations (multiple data per thread) + *********************************************************************/ + //@{ + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. + * + * \par + * - Supports non-commutative scan operators. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an exclusive prefix max scan of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide exclusive prefix max scan + * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. + * The corresponding output \p thread_data in those threads will be + * { [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp> + __device__ __forceinline__ void ExclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + T initial_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in thread0) + ScanOp scan_op) ///< [in] Binary scan functor + { + // Reduce consecutive thread items in registers + T thread_prefix = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op); + + // Exclusive scan in registers with prefix as seed + internal::ThreadScanExclusive(input, output, scan_op, thread_prefix); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - Supports non-commutative scan operators. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an exclusive prefix max scan of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide exclusive prefix max scan + * int block_aggregate; + * BlockScan(temp_storage).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. The + * corresponding output \p thread_data in those threads will be { [INT_MIN,0,0,2], [2,4,4,6], ..., [506,508,508,510] }. + * Furthermore the value \p 510 will be stored in \p block_aggregate for all threads. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp> + __device__ __forceinline__ void ExclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + T initial_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to \p output[0] in thread0) + ScanOp scan_op, ///< [in] Binary scan functor + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + // Reduce consecutive thread items in registers + T thread_prefix = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveScan(thread_prefix, thread_prefix, initial_value, scan_op, block_aggregate); + + // Exclusive scan in registers with prefix as seed + internal::ThreadScanExclusive(input, output, scan_op, thread_prefix); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. + * - Supports non-commutative scan operators. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a single thread block that progressively + * computes an exclusive prefix max scan over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 128 integer items that are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. 
+ * struct BlockPrefixCallbackOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. + * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) + * { + * // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread + * typedef cub::BlockLoad BlockLoad; + * typedef cub::BlockStore BlockStore; + * typedef cub::BlockScan BlockScan; + * + * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan + * __shared__ union { + * typename BlockLoad::TempStorage load; + * typename BlockScan::TempStorage scan; + * typename BlockStore::TempStorage store; + * } temp_storage; + * + * // Initialize running total + * BlockPrefixCallbackOp prefix_op(0); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); + * CTA_SYNC(); + * + * // Collectively compute the block-wide exclusive prefix max scan + * BlockScan(temp_storage.scan).ExclusiveScan( + * thread_data, thread_data, INT_MIN, cub::Max(), prefix_op); + * CTA_SYNC(); + * + * // Store scanned items to output segment + * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); + * CTA_SYNC(); + * } + * \endcode + * \par + * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... + * The corresponding output for the first segment will be INT_MIN, 0, 0, 2, 2, 4, ..., 508, 510. + * The output for the second segment will be 510, 512, 512, 514, 514, 516, ..., 1020, 1022. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan functor + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. 
+ { + // Reduce consecutive thread items in registers + T thread_prefix = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_prefix_callback_op); + + // Exclusive scan in registers with prefix as seed + internal::ThreadScanExclusive(input, output, scan_op, thread_prefix); + } + + + //@} end member group +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document no-initial-value scans + + /******************************************************************//** + * \name Exclusive prefix scan operations (no initial value, single datum per thread) + *********************************************************************/ + //@{ + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined. + * + * \par + * - Supports non-commutative scan operators. + * - \rowmajor + * - \smemreuse + * + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan functor + { + InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. + * + * \par + * - Supports non-commutative scan operators. + * - \rowmajor + * - \smemreuse + * + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan functor + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + InternalBlockScan(temp_storage).ExclusiveScan(input, output, scan_op, block_aggregate); + } + + //@} end member group + /******************************************************************//** + * \name Exclusive prefix scan operations (no initial value, multiple data per thread) + *********************************************************************/ + //@{ + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. With no initial value, the output computed for thread0 is undefined. + * + * \par + * - Supports non-commutative scan operators. + * - \blocked + * - \granularity + * - \smemreuse + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
+ * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp> + __device__ __forceinline__ void ExclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan functor + { + // Reduce consecutive thread items in registers + T thread_partial = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveScan(thread_partial, thread_partial, scan_op); + + // Exclusive scan in registers with prefix + internal::ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0)); + } + + + /** + * \brief Computes an exclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. + * + * \par + * - Supports non-commutative scan operators. + * - \blocked + * - \granularity + * - \smemreuse + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp> + __device__ __forceinline__ void ExclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan functor + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + // Reduce consecutive thread items in registers + T thread_partial = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveScan(thread_partial, thread_partial, scan_op, block_aggregate); + + // Exclusive scan in registers with prefix + internal::ThreadScanExclusive(input, output, scan_op, thread_partial, (linear_tid != 0)); + } + + + //@} end member group +#endif // DOXYGEN_SHOULD_SKIP_THIS // Do not document no-initial-value scans + + /******************************************************************//** + * \name Inclusive prefix sum operations + *********************************************************************/ + //@{ + + + /** + * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. + * + * \par + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an inclusive prefix sum of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... + * + * // Collectively compute the block-wide inclusive prefix sum + * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. 
The + * corresponding output \p thread_data in those threads will be 1, 2, ..., 128. + * + */ + __device__ __forceinline__ void InclusiveSum( + T input, ///< [in] Calling thread's input item + T &output) ///< [out] Calling thread's output item (may be aliased to \p input) + { + InclusiveScan(input, output, cub::Sum()); + } + + + /** + * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an inclusive prefix sum of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... + * + * // Collectively compute the block-wide inclusive prefix sum + * int block_aggregate; + * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is 1, 1, ..., 1. The + * corresponding output \p thread_data in those threads will be 1, 2, ..., 128. + * Furthermore the value \p 128 will be stored in \p block_aggregate for all threads. + * + */ + __device__ __forceinline__ void InclusiveSum( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + InclusiveScan(input, output, cub::Sum(), block_aggregate); + } + + + + /** + * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes one input element. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a single thread block that progressively + * computes an inclusive prefix sum over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 128 integer items that are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. 
+ * struct BlockPrefixCallbackOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. + * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total += block_aggregate; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) + * { + * // Specialize BlockScan for a 1D block of 128 threads + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Initialize running total + * BlockPrefixCallbackOp prefix_op(0); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data = d_data[block_offset]; + * + * // Collectively compute the block-wide inclusive prefix sum + * BlockScan(temp_storage).InclusiveSum( + * thread_data, thread_data, prefix_op); + * CTA_SYNC(); + * + * // Store scanned items to output segment + * d_data[block_offset] = thread_data; + * } + * \endcode + * \par + * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, .... + * The corresponding output for the first segment will be 1, 2, ..., 128. + * The output for the second segment will be 129, 130, ..., 256. + * + * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + */ + template + __device__ __forceinline__ void InclusiveSum( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. + { + InclusiveScan(input, output, cub::Sum(), block_prefix_callback_op); + } + + + //@} end member group + /******************************************************************//** + * \name Inclusive prefix sum operations (multiple data per thread) + *********************************************************************/ + //@{ + + + /** + * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. + * + * \par + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an inclusive prefix sum of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... 
+ * + * // Collectively compute the block-wide inclusive prefix sum + * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The + * corresponding output \p thread_data in those threads will be { [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + */ + template + __device__ __forceinline__ void InclusiveSum( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD]) ///< [out] Calling thread's output items (may be aliased to \p input) + { + if (ITEMS_PER_THREAD == 1) + { + InclusiveSum(input[0], output[0]); + } + else + { + // Reduce consecutive thread items in registers + Sum scan_op; + T thread_prefix = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveSum(thread_prefix, thread_prefix); + + // Inclusive scan in registers with prefix as seed + internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0)); + } + } + + + /** + * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an inclusive prefix sum of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide inclusive prefix sum + * int block_aggregate; + * BlockScan(temp_storage).InclusiveSum(thread_data, thread_data, block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [1,1,1,1], [1,1,1,1], ..., [1,1,1,1] }. The + * corresponding output \p thread_data in those threads will be + * { [1,2,3,4], [5,6,7,8], ..., [509,510,511,512] }. + * Furthermore the value \p 512 will be stored in \p block_aggregate for all threads. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. 
+ * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void InclusiveSum( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + if (ITEMS_PER_THREAD == 1) + { + InclusiveSum(input[0], output[0], block_aggregate); + } + else + { + // Reduce consecutive thread items in registers + Sum scan_op; + T thread_prefix = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveSum(thread_prefix, thread_prefix, block_aggregate); + + // Inclusive scan in registers with prefix as seed + internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0)); + } + } + + + /** + * \brief Computes an inclusive block-wide prefix scan using addition (+) as the scan operator. Each thread contributes an array of consecutive input elements. Instead of using 0 as the block-wide prefix, the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a single thread block that progressively + * computes an inclusive prefix sum over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 512 integer items that are partitioned in a [blocked arrangement](index.html#sec5sec3) + * across 128 threads where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. + * struct BlockPrefixCallbackOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. + * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total += block_aggregate; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) 
+ * {
+ * // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread
+ * typedef cub::BlockLoad BlockLoad;
+ * typedef cub::BlockStore BlockStore;
+ * typedef cub::BlockScan BlockScan;
+ *
+ * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
+ * __shared__ union {
+ * typename BlockLoad::TempStorage load;
+ * typename BlockScan::TempStorage scan;
+ * typename BlockStore::TempStorage store;
+ * } temp_storage;
+ *
+ * // Initialize running total
+ * BlockPrefixCallbackOp prefix_op(0);
+ *
+ * // Have the block iterate over segments of items
+ * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4)
+ * {
+ * // Load a segment of consecutive items that are blocked across threads
+ * int thread_data[4];
+ * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data);
+ * CTA_SYNC();
+ *
+ * // Collectively compute the block-wide inclusive prefix sum
+ * BlockScan(temp_storage.scan).InclusiveSum(
+ * thread_data, thread_data, prefix_op);
+ * CTA_SYNC();
+ *
+ * // Store scanned items to output segment
+ * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data);
+ * CTA_SYNC();
+ * }
+ * \endcode
+ * \par
+ * Suppose the input \p d_data is 1, 1, 1, 1, 1, 1, 1, 1, ....
+ * The corresponding output for the first segment will be 1, 2, 3, 4, ..., 511, 512.
+ * The output for the second segment will be 513, 514, 515, 516, ..., 1023, 1024.
+ *
+ * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread.
+ * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate)
+ */
+ template <
+ int ITEMS_PER_THREAD,
+ typename BlockPrefixCallbackOp>
+ __device__ __forceinline__ void InclusiveSum(
+ T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items
+ T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input)
+ BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence.
+ {
+ if (ITEMS_PER_THREAD == 1)
+ {
+ InclusiveSum(input[0], output[0], block_prefix_callback_op);
+ }
+ else
+ {
+ // Reduce consecutive thread items in registers
+ Sum scan_op;
+ T thread_prefix = internal::ThreadReduce(input, scan_op);
+
+ // Exclusive thread block-scan
+ ExclusiveSum(thread_prefix, thread_prefix, block_prefix_callback_op);
+
+ // Inclusive scan in registers with prefix as seed
+ internal::ThreadScanInclusive(input, output, scan_op, thread_prefix);
+ }
+ }
+
+
+ //@} end member group
+ /******************************************************************//**
+ * \name Inclusive prefix scan operations
+ *********************************************************************/
+ //@{
+
+
+ /**
+ * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element.
+ *
+ * \par
+ * - Supports non-commutative scan operators.
+ * - \rowmajor
+ * - \smemreuse
+ *
+ * \par Snippet
+ * The code snippet below illustrates an inclusive prefix max scan of 128 integer items that
+ * are partitioned across 128 threads.
+ * \par
+ * \code
+ * #include // or equivalently
+ *
+ * __global__ void ExampleKernel(...)
+ * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... + * + * // Collectively compute the block-wide inclusive prefix max scan + * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The + * corresponding output \p thread_data in those threads will be 0, 0, 2, 2, ..., 126, 126. + * + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan functor + { + InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op); + } + + + /** + * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - Supports non-commutative scan operators. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an inclusive prefix max scan of 128 integer items that + * are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain input item for each thread + * int thread_data; + * ... + * + * // Collectively compute the block-wide inclusive prefix max scan + * int block_aggregate; + * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is 0, -1, 2, -3, ..., 126, -127. The + * corresponding output \p thread_data in those threads will be 0, 0, 2, 2, ..., 126, 126. + * Furthermore the value \p 126 will be stored in \p block_aggregate for all threads. + * + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan functor + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_aggregate); + } + + + /** + * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
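The seed/aggregate contract described above can be summarized sequentially: for each block-wide scan (one "tile"), the scan hands the callback the tile-wide aggregate and uses the returned value as the prefix for every output in that tile. Below is a minimal host-side model of that contract, using max as the scan operator to mirror the running-max snippet that follows; the function name is hypothetical and the sketch is illustrative only, not part of the sources added by this diff.

#include <algorithm>
#include <functional>
#include <vector>

// Sequential model of the BlockPrefixCallbackOp contract (illustrative only):
// the callback receives each tile's aggregate and returns the seed that
// logically prefixes that tile's outputs.
static void inclusive_max_scan_by_tiles(
    std::vector<int> &data,
    size_t tile_items,
    const std::function<int(int)> &block_prefix_callback_op)
{
    for (size_t base = 0; base < data.size(); base += tile_items)
    {
        size_t end = std::min(data.size(), base + tile_items);

        // Tile-wide aggregate of the raw inputs (what the callback is handed)
        int block_aggregate = data[base];
        for (size_t i = base + 1; i < end; ++i)
            block_aggregate = std::max(block_aggregate, data[i]);

        // The seed returned by the callback prefixes every output in the tile
        int running = block_prefix_callback_op(block_aggregate);

        for (size_t i = base; i < end; ++i)
        {
            running = std::max(running, data[i]);
            data[i] = running;
        }
    }
}

Pairing this model with the stateful running-max functor shown in the snippet below, seeded with INT_MIN, reproduces the per-segment outputs quoted in that snippet's discussion.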
+ * + * \par + * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. + * - Supports non-commutative scan operators. + * - \rowmajor + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a single thread block that progressively + * computes an inclusive prefix max scan over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 128 integer items that are partitioned across 128 threads. + * \par + * \code + * #include // or equivalently + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. + * struct BlockPrefixCallbackOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. + * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) + * { + * // Specialize BlockScan for a 1D block of 128 threads + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Initialize running total + * BlockPrefixCallbackOp prefix_op(INT_MIN); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data = d_data[block_offset]; + * + * // Collectively compute the block-wide inclusive prefix max scan + * BlockScan(temp_storage).InclusiveScan( + * thread_data, thread_data, cub::Max(), prefix_op); + * CTA_SYNC(); + * + * // Store scanned items to output segment + * d_data[block_offset] = thread_data; + * } + * \endcode + * \par + * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... + * The corresponding output for the first segment will be 0, 0, 2, 2, ..., 126, 126. + * The output for the second segment will be 128, 128, 130, 130, ..., 254, 254. + * + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + */ + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan functor + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. 
+ { + InternalBlockScan(temp_storage).InclusiveScan(input, output, scan_op, block_prefix_callback_op); + } + + + //@} end member group + /******************************************************************//** + * \name Inclusive prefix scan operations (multiple data per thread) + *********************************************************************/ + //@{ + + + /** + * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. + * + * \par + * - Supports non-commutative scan operators. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an inclusive prefix max scan of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide inclusive prefix max scan + * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. The + * corresponding output \p thread_data in those threads will be { [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp> + __device__ __forceinline__ void InclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan functor + { + if (ITEMS_PER_THREAD == 1) + { + InclusiveScan(input[0], output[0], scan_op); + } + else + { + // Reduce consecutive thread items in registers + T thread_prefix = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveScan(thread_prefix, thread_prefix, scan_op); + + // Inclusive scan in registers with prefix as seed (first thread does not seed) + internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0)); + } + } + + + /** + * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - Supports non-commutative scan operators. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates an inclusive prefix max scan of 512 integer items that + * are partitioned in a [blocked arrangement](index.html#sec5sec3) across 128 threads + * where each thread owns 4 consecutive items. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize BlockScan for a 1D block of 128 threads on type int + * typedef cub::BlockScan BlockScan; + * + * // Allocate shared memory for BlockScan + * __shared__ typename BlockScan::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Collectively compute the block-wide inclusive prefix max scan + * int block_aggregate; + * BlockScan(temp_storage).InclusiveScan(thread_data, thread_data, cub::Max(), block_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is + * { [0,-1,2,-3], [4,-5,6,-7], ..., [508,-509,510,-511] }. + * The corresponding output \p thread_data in those threads will be + * { [0,0,2,2], [4,4,6,6], ..., [508,508,510,510] }. + * Furthermore the value \p 510 will be stored in \p block_aggregate for all threads. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp> + __device__ __forceinline__ void InclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan functor + T &block_aggregate) ///< [out] block-wide aggregate reduction of input items + { + if (ITEMS_PER_THREAD == 1) + { + InclusiveScan(input[0], output[0], scan_op, block_aggregate); + } + else + { + // Reduce consecutive thread items in registers + T thread_prefix = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan (with no initial value) + ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_aggregate); + + // Inclusive scan in registers with prefix as seed (first thread does not seed) + internal::ThreadScanInclusive(input, output, scan_op, thread_prefix, (linear_tid != 0)); + } + } + + + /** + * \brief Computes an inclusive block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes an array of consecutive input elements. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + * + * \par + * - The \p block_prefix_callback_op functor must implement a member function T operator()(T block_aggregate). + * The functor's input parameter \p block_aggregate is the same value also returned by the scan operation. + * The functor will be invoked by the first warp of threads in the block, however only the return value from + * lane0 is applied as the block-wide prefix. Can be stateful. + * - Supports non-commutative scan operators. + * - \blocked + * - \granularity + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates a single thread block that progressively + * computes an inclusive prefix max scan over multiple "tiles" of input using a + * prefix functor to maintain a running total between block-wide scans. Each tile consists + * of 128 integer items that are partitioned across 128 threads. 
+ * \par + * \code + * #include // or equivalently + * + * // A stateful callback functor that maintains a running prefix to be applied + * // during consecutive scan operations. + * struct BlockPrefixCallbackOp + * { + * // Running prefix + * int running_total; + * + * // Constructor + * __device__ BlockPrefixCallbackOp(int running_total) : running_total(running_total) {} + * + * // Callback operator to be entered by the first warp of threads in the block. + * // Thread-0 is responsible for returning a value for seeding the block-wide scan. + * __device__ int operator()(int block_aggregate) + * { + * int old_prefix = running_total; + * running_total = (block_aggregate > old_prefix) ? block_aggregate : old_prefix; + * return old_prefix; + * } + * }; + * + * __global__ void ExampleKernel(int *d_data, int num_items, ...) + * { + * // Specialize BlockLoad, BlockStore, and BlockScan for a 1D block of 128 threads, 4 ints per thread + * typedef cub::BlockLoad BlockLoad; + * typedef cub::BlockStore BlockStore; + * typedef cub::BlockScan BlockScan; + * + * // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan + * __shared__ union { + * typename BlockLoad::TempStorage load; + * typename BlockScan::TempStorage scan; + * typename BlockStore::TempStorage store; + * } temp_storage; + * + * // Initialize running total + * BlockPrefixCallbackOp prefix_op(0); + * + * // Have the block iterate over segments of items + * for (int block_offset = 0; block_offset < num_items; block_offset += 128 * 4) + * { + * // Load a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * BlockLoad(temp_storage.load).Load(d_data + block_offset, thread_data); + * CTA_SYNC(); + * + * // Collectively compute the block-wide inclusive prefix max scan + * BlockScan(temp_storage.scan).InclusiveScan( + * thread_data, thread_data, cub::Max(), prefix_op); + * CTA_SYNC(); + * + * // Store scanned items to output segment + * BlockStore(temp_storage.store).Store(d_data + block_offset, thread_data); + * CTA_SYNC(); + * } + * \endcode + * \par + * Suppose the input \p d_data is 0, -1, 2, -3, 4, -5, .... + * The corresponding output for the first segment will be 0, 0, 2, 2, 4, 4, ..., 510, 510. + * The output for the second segment will be 512, 512, 514, 514, 516, 516, ..., 1022, 1022. + * + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * \tparam BlockPrefixCallbackOp [inferred] Call-back functor type having member T operator()(T block_aggregate) + */ + template < + int ITEMS_PER_THREAD, + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void InclusiveScan( + T (&input)[ITEMS_PER_THREAD], ///< [in] Calling thread's input items + T (&output)[ITEMS_PER_THREAD], ///< [out] Calling thread's output items (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan functor + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a block-wide prefix to be applied to the logical input sequence. 
+ { + if (ITEMS_PER_THREAD == 1) + { + InclusiveScan(input[0], output[0], scan_op, block_prefix_callback_op); + } + else + { + // Reduce consecutive thread items in registers + T thread_prefix = internal::ThreadReduce(input, scan_op); + + // Exclusive thread block-scan + ExclusiveScan(thread_prefix, thread_prefix, scan_op, block_prefix_callback_op); + + // Inclusive scan in registers with prefix as seed + internal::ThreadScanInclusive(input, output, scan_op, thread_prefix); + } + } + + //@} end member group + + +}; + +/** + * \example example_block_scan.cu + */ + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/block/block_shuffle.cuh b/GraphBLAS/CUDA/local_cub/block/block_shuffle.cuh new file mode 100644 index 0000000000..a0cc71d222 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/block/block_shuffle.cuh @@ -0,0 +1,305 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockShuffle class provides [collective](index.html#sec0) methods for shuffling data partitioned across a CUDA thread block. + */ + +#pragma once + +#include "../util_arch.cuh" +#include "../util_ptx.cuh" +#include "../util_macro.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief The BlockShuffle class provides [collective](index.html#sec0) methods for shuffling data partitioned across a CUDA thread block. + * \ingroup BlockModule + * + * \tparam T The data type to be exchanged. 
+ * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * It is commonplace for blocks of threads to rearrange data items between + * threads. The BlockShuffle abstraction allows threads to efficiently shift items + * either (a) up to their successor or (b) down to their predecessor. + * + */ +template < + typename T, + int BLOCK_DIM_X, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockShuffle +{ +private: + + /****************************************************************************** + * Constants + ******************************************************************************/ + + enum + { + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + LOG_WARP_THREADS = CUB_LOG_WARP_THREADS(PTX_ARCH), + WARP_THREADS = 1 << LOG_WARP_THREADS, + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + }; + + /****************************************************************************** + * Type definitions + ******************************************************************************/ + + /// Shared memory storage layout type (last element from each thread's input) + struct _TempStorage + { + T prev[BLOCK_THREADS]; + T next[BLOCK_THREADS]; + }; + + +public: + + /// \smemstorage{BlockShuffle} + struct TempStorage : Uninitialized<_TempStorage> {}; + +private: + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + /// Linear thread-id + unsigned int linear_tid; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + +public: + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockShuffle() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. + */ + __device__ __forceinline__ BlockShuffle( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Shuffle movement + *********************************************************************/ + //@{ + + + /** + * \brief Each threadi obtains the \p input provided by threadi+distance. The offset \p distance may be negative. 
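As a sketch of the intended use of this shuffle interface, Offset with a distance of -1 lets each thread read its predecessor's value, for example to form adjacent differences. The kernel and buffer names are hypothetical, the include path is assumed to mirror the upstream CUB layout, and the sketch is illustrative only, not part of the sources added by this diff.

#include <cub/block/block_shuffle.cuh>   // assumed path; local_cub mirrors the upstream CUB layout

__global__ void AdjacentDifferenceKernel(const int *d_in, int *d_out)
{
    // Specialize BlockShuffle for a 1D block of 128 threads on type int
    typedef cub::BlockShuffle<int, 128> BlockShuffle;

    // Allocate shared memory for BlockShuffle
    __shared__ typename BlockShuffle::TempStorage temp_storage;

    int item = d_in[blockIdx.x * 128 + threadIdx.x];

    // Fetch the predecessor's item; thread0's output is left untouched,
    // so seed it with an identity of our choosing.
    int prev = 0;
    BlockShuffle(temp_storage).Offset(item, prev, -1);

    d_out[blockIdx.x * 128 + threadIdx.x] = item - prev;
}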
+ * + * \par + * - \smemreuse + */ + __device__ __forceinline__ void Offset( + T input, ///< [in] The input item from the calling thread (threadi) + T& output, ///< [out] The \p input item from the successor (or predecessor) thread threadi+distance (may be aliased to \p input). This value is only updated for for threadi when 0 <= (i + \p distance) < BLOCK_THREADS-1 + int distance = 1) ///< [in] Offset distance (may be negative) + { + temp_storage[linear_tid].prev = input; + + CTA_SYNC(); + + if ((linear_tid + distance >= 0) && (linear_tid + distance < BLOCK_THREADS)) + output = temp_storage[linear_tid + distance].prev; + } + + + /** + * \brief Each threadi obtains the \p input provided by threadi+distance. + * + * \par + * - \smemreuse + */ + __device__ __forceinline__ void Rotate( + T input, ///< [in] The calling thread's input item + T& output, ///< [out] The \p input item from thread thread(i+distance>)% (may be aliased to \p input). This value is not updated for threadBLOCK_THREADS-1 + unsigned int distance = 1) ///< [in] Offset distance (0 < \p distance < BLOCK_THREADS) + { + temp_storage[linear_tid].prev = input; + + CTA_SYNC(); + + unsigned int offset = threadIdx.x + distance; + if (offset >= BLOCK_THREADS) + offset -= BLOCK_THREADS; + + output = temp_storage[offset].prev; + } + + + /** + * \brief The thread block rotates its [blocked arrangement](index.html#sec5sec3) of \p input items, shifting it up by one item + * + * \par + * - \blocked + * - \granularity + * - \smemreuse + */ + template + __device__ __forceinline__ void Up( + T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items + T (&prev)[ITEMS_PER_THREAD]) ///< [out] The corresponding predecessor items (may be aliased to \p input). The item \p prev[0] is not updated for thread0. + { + temp_storage[linear_tid].prev = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = ITEMS_PER_THREAD - 1; ITEM > 0; --ITEM) + prev[ITEM] = input[ITEM - 1]; + + + if (linear_tid > 0) + prev[0] = temp_storage[linear_tid - 1].prev; + } + + + /** + * \brief The thread block rotates its [blocked arrangement](index.html#sec5sec3) of \p input items, shifting it up by one item. All threads receive the \p input provided by threadBLOCK_THREADS-1. + * + * \par + * - \blocked + * - \granularity + * - \smemreuse + */ + template + __device__ __forceinline__ void Up( + T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items + T (&prev)[ITEMS_PER_THREAD], ///< [out] The corresponding predecessor items (may be aliased to \p input). The item \p prev[0] is not updated for thread0. + T &block_suffix) ///< [out] The item \p input[ITEMS_PER_THREAD-1] from threadBLOCK_THREADS-1, provided to all threads + { + Up(input, prev); + block_suffix = temp_storage[BLOCK_THREADS - 1].prev; + } + + + /** + * \brief The thread block rotates its [blocked arrangement](index.html#sec5sec3) of \p input items, shifting it down by one item + * + * \par + * - \blocked + * - \granularity + * - \smemreuse + */ + template + __device__ __forceinline__ void Down( + T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items + T (&prev)[ITEMS_PER_THREAD]) ///< [out] The corresponding predecessor items (may be aliased to \p input). The value \p prev[0] is not updated for threadBLOCK_THREADS-1. 
+ { + temp_storage[linear_tid].prev = input[ITEMS_PER_THREAD - 1]; + + CTA_SYNC(); + + #pragma unroll + for (int ITEM = ITEMS_PER_THREAD - 1; ITEM > 0; --ITEM) + prev[ITEM] = input[ITEM - 1]; + + if (linear_tid > 0) + prev[0] = temp_storage[linear_tid - 1].prev; + } + + + /** + * \brief The thread block rotates its [blocked arrangement](index.html#sec5sec3) of input items, shifting it down by one item. All threads receive \p input[0] provided by thread0. + * + * \par + * - \blocked + * - \granularity + * - \smemreuse + */ + template + __device__ __forceinline__ void Down( + T (&input)[ITEMS_PER_THREAD], ///< [in] The calling thread's input items + T (&prev)[ITEMS_PER_THREAD], ///< [out] The corresponding predecessor items (may be aliased to \p input). The value \p prev[0] is not updated for threadBLOCK_THREADS-1. + T &block_prefix) ///< [out] The item \p input[0] from thread0, provided to all threads + { + Up(input, prev); + block_prefix = temp_storage[BLOCK_THREADS - 1].prev; + } + + //@} end member group + + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/block/block_store.cuh b/GraphBLAS/CUDA/local_cub/block/block_store.cuh new file mode 100644 index 0000000000..648bf9ff4d --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/block/block_store.cuh @@ -0,0 +1,1000 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +/** + * \file + * Operations for writing linear segments of data from the CUDA thread block + */ + +#pragma once + +#include + +#include "block_exchange.cuh" +#include "../util_ptx.cuh" +#include "../util_macro.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup UtilIo + * @{ + */ + + +/******************************************************************//** + * \name Blocked arrangement I/O (direct) + *********************************************************************/ +//@{ + +/** + * \brief Store a blocked arrangement of items across a thread block into a linear segment of items. + * + * \blocked + * + * \tparam T [inferred] The data type to store. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. + */ +template < + typename T, + int ITEMS_PER_THREAD, + typename OutputIteratorT> +__device__ __forceinline__ void StoreDirectBlocked( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store +{ + OutputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD); + + // Store directly in thread-blocked order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + thread_itr[ITEM] = items[ITEM]; + } +} + + +/** + * \brief Store a blocked arrangement of items across a thread block into a linear segment of items, guarded by range + * + * \blocked + * + * \tparam T [inferred] The data type to store. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. + */ +template < + typename T, + int ITEMS_PER_THREAD, + typename OutputIteratorT> +__device__ __forceinline__ void StoreDirectBlocked( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write +{ + OutputIteratorT thread_itr = block_itr + (linear_tid * ITEMS_PER_THREAD); + + // Store directly in thread-blocked order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if (ITEM + (linear_tid * ITEMS_PER_THREAD) < valid_items) + { + thread_itr[ITEM] = items[ITEM]; + } + } +} + + +/** + * \brief Store a blocked arrangement of items across a thread block into a linear segment of items. + * + * \blocked + * + * The output offset (\p block_ptr + \p block_offset) must be quad-item aligned, + * which is the default starting offset returned by \p cudaMalloc() + * + * \par + * The following conditions will prevent vectorization and storing will fall back to cub::BLOCK_STORE_DIRECT: + * - \p ITEMS_PER_THREAD is odd + * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.) 
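To make the range-guarded overload above concrete, the sketch below writes full tiles with the unguarded StoreDirectBlocked and switches to the guarded overload for the final, partial tile. The kernel and buffer names are hypothetical, the include path is assumed, and a 1D launch with 128-thread blocks is assumed; this is an illustrative sketch, not part of the sources added by this diff.

#include <cub/block/block_store.cuh>   // assumed path; local_cub mirrors the upstream CUB layout

__global__ void StoreTailKernel(int *d_out, int num_items)
{
    const int BLOCK_THREADS    = 128;   // assumed launch configuration
    const int ITEMS_PER_THREAD = 4;
    const int TILE_ITEMS       = BLOCK_THREADS * ITEMS_PER_THREAD;

    // A blocked arrangement: each thread holds 4 consecutive items
    int items[ITEMS_PER_THREAD];
    #pragma unroll
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
        items[i] = threadIdx.x * ITEMS_PER_THREAD + i;

    int block_offset = blockIdx.x * TILE_ITEMS;
    int valid_items  = num_items - block_offset;

    if (valid_items >= TILE_ITEMS)
    {
        // Full tile: unguarded blocked store
        cub::StoreDirectBlocked(threadIdx.x, d_out + block_offset, items);
    }
    else
    {
        // Partial tile: guard the store with the number of valid items
        cub::StoreDirectBlocked(threadIdx.x, d_out + block_offset, items, valid_items);
    }
}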
+ * + * \tparam T [inferred] The data type to store. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * + */ +template < + typename T, + int ITEMS_PER_THREAD> +__device__ __forceinline__ void StoreDirectBlockedVectorized( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + T *block_ptr, ///< [in] Input pointer for storing from + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store +{ + enum + { + // Maximum CUDA vector size is 4 elements + MAX_VEC_SIZE = CUB_MIN(4, ITEMS_PER_THREAD), + + // Vector size must be a power of two and an even divisor of the items per thread + VEC_SIZE = ((((MAX_VEC_SIZE - 1) & MAX_VEC_SIZE) == 0) && ((ITEMS_PER_THREAD % MAX_VEC_SIZE) == 0)) ? + MAX_VEC_SIZE : + 1, + + VECTORS_PER_THREAD = ITEMS_PER_THREAD / VEC_SIZE, + }; + + // Vector type + typedef typename CubVector::Type Vector; + + // Alias global pointer + Vector *block_ptr_vectors = reinterpret_cast(const_cast(block_ptr)); + + // Alias pointers (use "raw" array here which should get optimized away to prevent conservative PTXAS lmem spilling) + Vector raw_vector[VECTORS_PER_THREAD]; + T *raw_items = reinterpret_cast(raw_vector); + + // Copy + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + raw_items[ITEM] = items[ITEM]; + } + + // Direct-store using vector types + StoreDirectBlocked(linear_tid, block_ptr_vectors, raw_vector); +} + + + +//@} end member group +/******************************************************************//** + * \name Striped arrangement I/O (direct) + *********************************************************************/ +//@{ + + +/** + * \brief Store a striped arrangement of data across the thread block into a linear segment of items. + * + * \striped + * + * \tparam BLOCK_THREADS The thread block size in threads + * \tparam T [inferred] The data type to store. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. + */ +template < + int BLOCK_THREADS, + typename T, + int ITEMS_PER_THREAD, + typename OutputIteratorT> +__device__ __forceinline__ void StoreDirectStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store +{ + OutputIteratorT thread_itr = block_itr + linear_tid; + + // Store directly in striped order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + thread_itr[(ITEM * BLOCK_THREADS)] = items[ITEM]; + } +} + + +/** + * \brief Store a striped arrangement of data across the thread block into a linear segment of items, guarded by range + * + * \striped + * + * \tparam BLOCK_THREADS The thread block size in threads + * \tparam T [inferred] The data type to store. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. 
+ */ +template < + int BLOCK_THREADS, + typename T, + int ITEMS_PER_THREAD, + typename OutputIteratorT> +__device__ __forceinline__ void StoreDirectStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write +{ + OutputIteratorT thread_itr = block_itr + linear_tid; + + // Store directly in striped order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if ((ITEM * BLOCK_THREADS) + linear_tid < valid_items) + { + thread_itr[(ITEM * BLOCK_THREADS)] = items[ITEM]; + } + } +} + + + +//@} end member group +/******************************************************************//** + * \name Warp-striped arrangement I/O (direct) + *********************************************************************/ +//@{ + + +/** + * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items. + * + * \warpstriped + * + * \par Usage Considerations + * The number of threads in the thread block must be a multiple of the architecture's warp size. + * + * \tparam T [inferred] The data type to store. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. + */ +template < + typename T, + int ITEMS_PER_THREAD, + typename OutputIteratorT> +__device__ __forceinline__ void StoreDirectWarpStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [out] Data to load +{ + int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); + int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; + int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; + + OutputIteratorT thread_itr = block_itr + warp_offset + tid; + + // Store directly in warp-striped order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + thread_itr[(ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM]; + } +} + + +/** + * \brief Store a warp-striped arrangement of data across the thread block into a linear segment of items, guarded by range + * + * \warpstriped + * + * \par Usage Considerations + * The number of threads in the thread block must be a multiple of the architecture's warp size. + * + * \tparam T [inferred] The data type to store. + * \tparam ITEMS_PER_THREAD [inferred] The number of consecutive items partitioned onto each thread. + * \tparam OutputIteratorT [inferred] The random-access iterator type for output \iterator. 
+ */ +template < + typename T, + int ITEMS_PER_THREAD, + typename OutputIteratorT> +__device__ __forceinline__ void StoreDirectWarpStriped( + int linear_tid, ///< [in] A suitable 1D thread-identifier for the calling thread (e.g., (threadIdx.y * blockDim.x) + linear_tid for 2D thread blocks) + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write +{ + int tid = linear_tid & (CUB_PTX_WARP_THREADS - 1); + int wid = linear_tid >> CUB_PTX_LOG_WARP_THREADS; + int warp_offset = wid * CUB_PTX_WARP_THREADS * ITEMS_PER_THREAD; + + OutputIteratorT thread_itr = block_itr + warp_offset + tid; + + // Store directly in warp-striped order + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ITEM++) + { + if (warp_offset + tid + (ITEM * CUB_PTX_WARP_THREADS) < valid_items) + { + thread_itr[(ITEM * CUB_PTX_WARP_THREADS)] = items[ITEM]; + } + } +} + + +//@} end member group + + +/** @} */ // end group UtilIo + + +//----------------------------------------------------------------------------- +// Generic BlockStore abstraction +//----------------------------------------------------------------------------- + +/** + * \brief cub::BlockStoreAlgorithm enumerates alternative algorithms for cub::BlockStore to write a blocked arrangement of items across a CUDA thread block to a linear segment of memory. + */ +enum BlockStoreAlgorithm +{ + /** + * \par Overview + * + * A [blocked arrangement](index.html#sec5sec3) of data is written + * directly to memory. + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) decreases as the + * access stride between threads increases (i.e., the number items per thread). + */ + BLOCK_STORE_DIRECT, + + /** + * \par Overview + * + * A [blocked arrangement](index.html#sec5sec3) of data is written directly + * to memory using CUDA's built-in vectorized stores as a coalescing optimization. + * For example, st.global.v4.s32 instructions will be generated + * when \p T = \p int and \p ITEMS_PER_THREAD % 4 == 0. + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high until the the + * access stride between threads (i.e., the number items per thread) exceeds the + * maximum vector store width (typically 4 items or 64B, whichever is lower). + * - The following conditions will prevent vectorization and writing will fall back to cub::BLOCK_STORE_DIRECT: + * - \p ITEMS_PER_THREAD is odd + * - The \p OutputIteratorT is not a simple pointer type + * - The block output offset is not quadword-aligned + * - The data type \p T is not a built-in primitive or CUDA vector type (e.g., \p short, \p int2, \p double, \p float2, etc.) + */ + BLOCK_STORE_VECTORIZE, + + /** + * \par Overview + * A [blocked arrangement](index.html#sec5sec3) is locally + * transposed and then efficiently written to memory as a [striped arrangement](index.html#sec5sec3). + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high regardless + * of items written per thread. + * - The local reordering incurs slightly longer latencies and throughput than the + * direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives. 
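The coalescing argument behind these trade-offs comes down to which global addresses neighbouring threads touch. A small host-side sketch of the index arithmetic (illustrative only, not part of the sources added by this diff) makes the contrast explicit: under a blocked store, adjacent threads write addresses ITEMS_PER_THREAD apart, while under a striped store they write adjacent words, which is why the transposing variants convert to a striped arrangement before touching memory.

#include <cstdio>

// Global index written by thread `tid` for its ITEM-th element under the two
// arrangements discussed above (illustrative only).
static int blocked_index(int tid, int item, int items_per_thread)
{
    return tid * items_per_thread + item;   // neighbouring threads are items_per_thread apart
}

static int striped_index(int tid, int item, int block_threads)
{
    return item * block_threads + tid;      // neighbouring threads write adjacent addresses
}

int main()
{
    const int BLOCK_THREADS = 128, ITEMS_PER_THREAD = 4;
    // First element written by threads 0..3:
    for (int tid = 0; tid < 4; ++tid)
        std::printf("tid %d: blocked -> %d, striped -> %d\n",
                    tid,
                    blocked_index(tid, 0, ITEMS_PER_THREAD),   // 0, 4, 8, 12
                    striped_index(tid, 0, BLOCK_THREADS));     // 0, 1, 2, 3
    return 0;
}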
+ */ + BLOCK_STORE_TRANSPOSE, + + /** + * \par Overview + * A [blocked arrangement](index.html#sec5sec3) is locally + * transposed and then efficiently written to memory as a + * [warp-striped arrangement](index.html#sec5sec3) + * + * \par Usage Considerations + * - BLOCK_THREADS must be a multiple of WARP_THREADS + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high regardless + * of items written per thread. + * - The local reordering incurs slightly longer latencies and throughput than the + * direct cub::BLOCK_STORE_DIRECT and cub::BLOCK_STORE_VECTORIZE alternatives. + */ + BLOCK_STORE_WARP_TRANSPOSE, + + /** + * \par Overview + * A [blocked arrangement](index.html#sec5sec3) is locally + * transposed and then efficiently written to memory as a + * [warp-striped arrangement](index.html#sec5sec3) + * To reduce the shared memory requirement, only one warp's worth of shared + * memory is provisioned and is subsequently time-sliced among warps. + * + * \par Usage Considerations + * - BLOCK_THREADS must be a multiple of WARP_THREADS + * + * \par Performance Considerations + * - The utilization of memory transactions (coalescing) remains high regardless + * of items written per thread. + * - Provisions less shared memory temporary storage, but incurs larger + * latencies than the BLOCK_STORE_WARP_TRANSPOSE alternative. + */ + BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, + +}; + + +/** + * \brief The BlockStore class provides [collective](index.html#sec0) data movement methods for writing a [blocked arrangement](index.html#sec5sec3) of items partitioned across a CUDA thread block to a linear segment of memory. ![](block_store_logo.png) + * \ingroup BlockModule + * \ingroup UtilIo + * + * \tparam T The type of data to be written. + * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension + * \tparam ITEMS_PER_THREAD The number of consecutive items partitioned onto each thread. + * \tparam ALGORITHM [optional] cub::BlockStoreAlgorithm tuning policy enumeration. default: cub::BLOCK_STORE_DIRECT. + * \tparam WARP_TIME_SLICING [optional] Whether or not only one warp's worth of shared memory should be allocated and time-sliced among block-warps during any load-related data transpositions (versus each warp having its own storage). (default: false) + * \tparam BLOCK_DIM_Y [optional] The thread block length in threads along the Y dimension (default: 1) + * \tparam BLOCK_DIM_Z [optional] The thread block length in threads along the Z dimension (default: 1) + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - The BlockStore class provides a single data movement abstraction that can be specialized + * to implement different cub::BlockStoreAlgorithm strategies. This facilitates different + * performance policies for different architectures, data types, granularity sizes, etc. + * - BlockStore can be optionally specialized by different data movement strategies: + * -# cub::BLOCK_STORE_DIRECT. A [blocked arrangement](index.html#sec5sec3) of data is written + * directly to memory. [More...](\ref cub::BlockStoreAlgorithm) + * -# cub::BLOCK_STORE_VECTORIZE. A [blocked arrangement](index.html#sec5sec3) + * of data is written directly to memory using CUDA's built-in vectorized stores as a + * coalescing optimization. [More...](\ref cub::BlockStoreAlgorithm) + * -# cub::BLOCK_STORE_TRANSPOSE. 
A [blocked arrangement](index.html#sec5sec3) + * is locally transposed into a [striped arrangement](index.html#sec5sec3) which is + * then written to memory. [More...](\ref cub::BlockStoreAlgorithm) + * -# cub::BLOCK_STORE_WARP_TRANSPOSE. A [blocked arrangement](index.html#sec5sec3) + * is locally transposed into a [warp-striped arrangement](index.html#sec5sec3) which is + * then written to memory. [More...](\ref cub::BlockStoreAlgorithm) + * - \rowmajor + * + * \par A Simple Example + * \blockcollective{BlockStore} + * \par + * The code snippet below illustrates the storing of a "blocked" arrangement + * of 512 integers across 128 threads (where each thread owns 4 consecutive items) + * into a linear segment of memory. The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE, + * meaning items are locally reordered among threads so that memory references will be + * efficiently coalesced using a warp-striped access pattern. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, ...) + * { + * // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockStore BlockStore; + * + * // Allocate shared memory for BlockStore + * __shared__ typename BlockStore::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Store items to linear memory + * int thread_data[4]; + * BlockStore(temp_storage).Store(d_data, thread_data); + * + * \endcode + * \par + * Suppose the set of \p thread_data across the block of threads is + * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. + * The output \p d_data will be 0, 1, 2, 3, 4, 5, .... + * + */ +template < + typename T, + int BLOCK_DIM_X, + int ITEMS_PER_THREAD, + BlockStoreAlgorithm ALGORITHM = BLOCK_STORE_DIRECT, + int BLOCK_DIM_Y = 1, + int BLOCK_DIM_Z = 1, + int PTX_ARCH = CUB_PTX_ARCH> +class BlockStore +{ +private: + /****************************************************************************** + * Constants and typed definitions + ******************************************************************************/ + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + + /****************************************************************************** + * Algorithmic variants + ******************************************************************************/ + + /// Store helper + template + struct StoreInternal; + + + /** + * BLOCK_STORE_DIRECT specialization of store helper + */ + template + struct StoreInternal + { + /// Shared memory storage layout type + typedef NullType TempStorage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ StoreInternal( + TempStorage &/*temp_storage*/, + int linear_tid) + : + linear_tid(linear_tid) + {} + + /// Store items into a linear segment of memory + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + StoreDirectBlocked(linear_tid, block_itr, items); + } + + /// Store items into a linear segment of memory, guarded by range + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid 
items to write + { + StoreDirectBlocked(linear_tid, block_itr, items, valid_items); + } + }; + + + /** + * BLOCK_STORE_VECTORIZE specialization of store helper + */ + template + struct StoreInternal + { + /// Shared memory storage layout type + typedef NullType TempStorage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ StoreInternal( + TempStorage &/*temp_storage*/, + int linear_tid) + : + linear_tid(linear_tid) + {} + + /// Store items into a linear segment of memory, specialized for native pointer types (attempts vectorization) + __device__ __forceinline__ void Store( + T *block_ptr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + StoreDirectBlockedVectorized(linear_tid, block_ptr, items); + } + + /// Store items into a linear segment of memory, specialized for opaque input iterators (skips vectorization) + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + StoreDirectBlocked(linear_tid, block_itr, items); + } + + /// Store items into a linear segment of memory, guarded by range + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write + { + StoreDirectBlocked(linear_tid, block_itr, items, valid_items); + } + }; + + + /** + * BLOCK_STORE_TRANSPOSE specialization of store helper + */ + template + struct StoreInternal + { + // BlockExchange utility type for keys + typedef BlockExchange BlockExchange; + + /// Shared memory storage layout type + struct _TempStorage : BlockExchange::TempStorage + { + /// Temporary storage for partially-full block guard + volatile int valid_items; + }; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ StoreInternal( + TempStorage &temp_storage, + int linear_tid) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + /// Store items into a linear segment of memory + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + BlockExchange(temp_storage).BlockedToStriped(items); + StoreDirectStriped(linear_tid, block_itr, items); + } + + /// Store items into a linear segment of memory, guarded by range + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write + { + BlockExchange(temp_storage).BlockedToStriped(items); + if (linear_tid == 0) + temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads + CTA_SYNC(); + StoreDirectStriped(linear_tid, block_itr, items, temp_storage.valid_items); + } + }; + + + /** + * BLOCK_STORE_WARP_TRANSPOSE specialization of store helper + */ + template + struct StoreInternal + { + enum + { + WARP_THREADS 
= CUB_WARP_THREADS(PTX_ARCH) + }; + + // Assert BLOCK_THREADS must be a multiple of WARP_THREADS + CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); + + // BlockExchange utility type for keys + typedef BlockExchange BlockExchange; + + /// Shared memory storage layout type + struct _TempStorage : BlockExchange::TempStorage + { + /// Temporary storage for partially-full block guard + volatile int valid_items; + }; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ StoreInternal( + TempStorage &temp_storage, + int linear_tid) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + /// Store items into a linear segment of memory + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + BlockExchange(temp_storage).BlockedToWarpStriped(items); + StoreDirectWarpStriped(linear_tid, block_itr, items); + } + + /// Store items into a linear segment of memory, guarded by range + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write + { + BlockExchange(temp_storage).BlockedToWarpStriped(items); + if (linear_tid == 0) + temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads + CTA_SYNC(); + StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items); + } + }; + + + /** + * BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED specialization of store helper + */ + template + struct StoreInternal + { + enum + { + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH) + }; + + // Assert BLOCK_THREADS must be a multiple of WARP_THREADS + CUB_STATIC_ASSERT((BLOCK_THREADS % WARP_THREADS == 0), "BLOCK_THREADS must be a multiple of WARP_THREADS"); + + // BlockExchange utility type for keys + typedef BlockExchange BlockExchange; + + /// Shared memory storage layout type + struct _TempStorage : BlockExchange::TempStorage + { + /// Temporary storage for partially-full block guard + volatile int valid_items; + }; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + + /// Constructor + __device__ __forceinline__ StoreInternal( + TempStorage &temp_storage, + int linear_tid) + : + temp_storage(temp_storage.Alias()), + linear_tid(linear_tid) + {} + + /// Store items into a linear segment of memory + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + BlockExchange(temp_storage).BlockedToWarpStriped(items); + StoreDirectWarpStriped(linear_tid, block_itr, items); + } + + /// Store items into a linear segment of memory, guarded by range + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T 
(&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write + { + BlockExchange(temp_storage).BlockedToWarpStriped(items); + if (linear_tid == 0) + temp_storage.valid_items = valid_items; // Move through volatile smem as a workaround to prevent RF spilling on subsequent loads + CTA_SYNC(); + StoreDirectWarpStriped(linear_tid, block_itr, items, temp_storage.valid_items); + } + }; + + /****************************************************************************** + * Type definitions + ******************************************************************************/ + + /// Internal load implementation to use + typedef StoreInternal InternalStore; + + + /// Shared memory storage layout type + typedef typename InternalStore::TempStorage _TempStorage; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Internal storage allocator + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Thread reference to shared storage + _TempStorage &temp_storage; + + /// Linear thread-id + int linear_tid; + +public: + + + /// \smemstorage{BlockStore} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using a private static allocation of shared memory as temporary storage. + */ + __device__ __forceinline__ BlockStore() + : + temp_storage(PrivateStorage()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. + */ + __device__ __forceinline__ BlockStore( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + //@} end member group + /******************************************************************//** + * \name Data movement + *********************************************************************/ + //@{ + + + /** + * \brief Store items into a linear segment of memory. + * + * \par + * - \blocked + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the storing of a "blocked" arrangement + * of 512 integers across 128 threads (where each thread owns 4 consecutive items) + * into a linear segment of memory. The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE, + * meaning items are locally reordered among threads so that memory references will be + * efficiently coalesced using a warp-striped access pattern. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, ...) 
+ * { + * // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockStore BlockStore; + * + * // Allocate shared memory for BlockStore + * __shared__ typename BlockStore::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Store items to linear memory + * int thread_data[4]; + * BlockStore(temp_storage).Store(d_data, thread_data); + * + * \endcode + * \par + * Suppose the set of \p thread_data across the block of threads is + * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] }. + * The output \p d_data will be 0, 1, 2, 3, 4, 5, .... + * + */ + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD]) ///< [in] Data to store + { + InternalStore(temp_storage, linear_tid).Store(block_itr, items); + } + + /** + * \brief Store items into a linear segment of memory, guarded by range. + * + * \par + * - \blocked + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the guarded storing of a "blocked" arrangement + * of 512 integers across 128 threads (where each thread owns 4 consecutive items) + * into a linear segment of memory. The store is specialized for \p BLOCK_STORE_WARP_TRANSPOSE, + * meaning items are locally reordered among threads so that memory references will be + * efficiently coalesced using a warp-striped access pattern. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(int *d_data, int valid_items, ...) + * { + * // Specialize BlockStore for a 1D block of 128 threads owning 4 integer items each + * typedef cub::BlockStore BlockStore; + * + * // Allocate shared memory for BlockStore + * __shared__ typename BlockStore::TempStorage temp_storage; + * + * // Obtain a segment of consecutive items that are blocked across threads + * int thread_data[4]; + * ... + * + * // Store items to linear memory + * int thread_data[4]; + * BlockStore(temp_storage).Store(d_data, thread_data, valid_items); + * + * \endcode + * \par + * Suppose the set of \p thread_data across the block of threads is + * { [0,1,2,3], [4,5,6,7], ..., [508,509,510,511] } and \p valid_items is \p 5. + * The output \p d_data will be 0, 1, 2, 3, 4, ?, ?, ?, ..., with + * only the first two threads being unmasked to store portions of valid data. + * + */ + template + __device__ __forceinline__ void Store( + OutputIteratorT block_itr, ///< [in] The thread block's base output iterator for storing to + T (&items)[ITEMS_PER_THREAD], ///< [in] Data to store + int valid_items) ///< [in] Number of valid items to write + { + InternalStore(temp_storage, linear_tid).Store(block_itr, items, valid_items); + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/block/specializations/block_histogram_atomic.cuh b/GraphBLAS/CUDA/local_cub/block/specializations/block_histogram_atomic.cuh new file mode 100644 index 0000000000..29db0df710 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/block/specializations/block_histogram_atomic.cuh @@ -0,0 +1,82 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. + */ + +#pragma once + +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief The BlockHistogramAtomic class provides atomic-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. + */ +template +struct BlockHistogramAtomic +{ + /// Shared memory storage layout type + struct TempStorage {}; + + + /// Constructor + __device__ __forceinline__ BlockHistogramAtomic( + TempStorage &temp_storage) + {} + + + /// Composite data onto an existing histogram + template < + typename T, + typename CounterT, + int ITEMS_PER_THREAD> + __device__ __forceinline__ void Composite( + T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram + CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram + { + // Update histogram + #pragma unroll + for (int i = 0; i < ITEMS_PER_THREAD; ++i) + { + atomicAdd(histogram + items[i], 1); + } + } + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/block/specializations/block_histogram_sort.cuh b/GraphBLAS/CUDA/local_cub/block/specializations/block_histogram_sort.cuh new file mode 100644 index 0000000000..9ef417adca --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/block/specializations/block_histogram_sort.cuh @@ -0,0 +1,226 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. + */ + +#pragma once + +#include "../../block/block_radix_sort.cuh" +#include "../../block/block_discontinuity.cuh" +#include "../../util_ptx.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + + +/** + * \brief The BlockHistogramSort class provides sorting-based methods for constructing block-wide histograms from data samples partitioned across a CUDA thread block. + */ +template < + typename T, ///< Sample type + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int ITEMS_PER_THREAD, ///< The number of samples per thread + int BINS, ///< The number of bins into which histogram samples may fall + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockHistogramSort +{ + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + // Parameterize BlockRadixSort type for our thread block + typedef BlockRadixSort< + T, + BLOCK_DIM_X, + ITEMS_PER_THREAD, + NullType, + 4, + (PTX_ARCH >= 350) ? 
true : false, + BLOCK_SCAN_WARP_SCANS, + cudaSharedMemBankSizeFourByte, + BLOCK_DIM_Y, + BLOCK_DIM_Z, + PTX_ARCH> + BlockRadixSortT; + + // Parameterize BlockDiscontinuity type for our thread block + typedef BlockDiscontinuity< + T, + BLOCK_DIM_X, + BLOCK_DIM_Y, + BLOCK_DIM_Z, + PTX_ARCH> + BlockDiscontinuityT; + + /// Shared memory + union _TempStorage + { + // Storage for sorting bin values + typename BlockRadixSortT::TempStorage sort; + + struct + { + // Storage for detecting discontinuities in the tile of sorted bin values + typename BlockDiscontinuityT::TempStorage flag; + + // Storage for noting begin/end offsets of bin runs in the tile of sorted bin values + unsigned int run_begin[BINS]; + unsigned int run_end[BINS]; + }; + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + // Thread fields + _TempStorage &temp_storage; + unsigned int linear_tid; + + + /// Constructor + __device__ __forceinline__ BlockHistogramSort( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + // Discontinuity functor + struct DiscontinuityOp + { + // Reference to temp_storage + _TempStorage &temp_storage; + + // Constructor + __device__ __forceinline__ DiscontinuityOp(_TempStorage &temp_storage) : + temp_storage(temp_storage) + {} + + // Discontinuity predicate + __device__ __forceinline__ bool operator()(const T &a, const T &b, int b_index) + { + if (a != b) + { + // Note the begin/end offsets in shared storage + temp_storage.run_begin[b] = b_index; + temp_storage.run_end[a] = b_index; + + return true; + } + else + { + return false; + } + } + }; + + + // Composite data onto an existing histogram + template < + typename CounterT > + __device__ __forceinline__ void Composite( + T (&items)[ITEMS_PER_THREAD], ///< [in] Calling thread's input values to histogram + CounterT histogram[BINS]) ///< [out] Reference to shared/device-accessible memory histogram + { + enum { TILE_SIZE = BLOCK_THREADS * ITEMS_PER_THREAD }; + + // Sort bytes in blocked arrangement + BlockRadixSortT(temp_storage.sort).Sort(items); + + CTA_SYNC(); + + // Initialize the shared memory's run_begin and run_end for each bin + int histo_offset = 0; + + #pragma unroll + for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) + { + temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE; + temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE; + } + // Finish up with guarded initialization if necessary + if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS)) + { + temp_storage.run_begin[histo_offset + linear_tid] = TILE_SIZE; + temp_storage.run_end[histo_offset + linear_tid] = TILE_SIZE; + } + + CTA_SYNC(); + + int flags[ITEMS_PER_THREAD]; // unused + + // Compute head flags to demarcate contiguous runs of the same bin in the sorted tile + DiscontinuityOp flag_op(temp_storage); + BlockDiscontinuityT(temp_storage.flag).FlagHeads(flags, items, flag_op); + + // Update begin for first item + if (linear_tid == 0) temp_storage.run_begin[items[0]] = 0; + + CTA_SYNC(); + + // Composite into histogram + histo_offset = 0; + + #pragma unroll + for(; histo_offset + BLOCK_THREADS <= BINS; histo_offset += BLOCK_THREADS) + { + int thread_offset = histo_offset + linear_tid; + CounterT count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset]; + histogram[thread_offset] += count; + } + + // Finish up with guarded composition if 
necessary + if ((BINS % BLOCK_THREADS != 0) && (histo_offset + linear_tid < BINS)) + { + int thread_offset = histo_offset + linear_tid; + CounterT count = temp_storage.run_end[thread_offset] - temp_storage.run_begin[thread_offset]; + histogram[thread_offset] += count; + } + } + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/block/specializations/block_reduce_raking.cuh b/GraphBLAS/CUDA/local_cub/block/specializations/block_reduce_raking.cuh new file mode 100644 index 0000000000..aff97fc9b5 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/block/specializations/block_reduce_raking.cuh @@ -0,0 +1,226 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block. Supports non-commutative reduction operators. + */ + +#pragma once + +#include "../../block/block_raking_layout.cuh" +#include "../../warp/warp_reduce.cuh" +#include "../../thread/thread_reduce.cuh" +#include "../../util_ptx.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief BlockReduceRaking provides raking-based methods of parallel reduction across a CUDA thread block. Supports non-commutative reduction operators. + * + * Supports non-commutative binary reduction operators. Unlike commutative + * reduction operators (e.g., addition), the application of a non-commutative + * reduction operator (e.g, string concatenation) across a sequence of inputs must + * honor the relative ordering of items and partial reductions when applying the + * reduction operator. 
+ * + * Compared to the implementation of BlockReduceRaking (which does not support + * non-commutative operators), this implementation requires a few extra + * rounds of inter-thread communication. + */ +template < + typename T, ///< Data type being reduced + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockReduceRaking +{ + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + /// Layout type for padded thread block raking grid + typedef BlockRakingLayout BlockRakingLayout; + + /// WarpReduce utility type + typedef typename WarpReduce::InternalWarpReduce WarpReduce; + + /// Constants + enum + { + /// Number of raking threads + RAKING_THREADS = BlockRakingLayout::RAKING_THREADS, + + /// Number of raking elements per warp synchronous raking thread + SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH, + + /// Cooperative work can be entirely warp synchronous + WARP_SYNCHRONOUS = (RAKING_THREADS == BLOCK_THREADS), + + /// Whether or not warp-synchronous reduction should be unguarded (i.e., the warp-reduction elements is a power of two + WARP_SYNCHRONOUS_UNGUARDED = PowerOfTwo::VALUE, + + /// Whether or not accesses into smem are unguarded + RAKING_UNGUARDED = BlockRakingLayout::UNGUARDED, + + }; + + + /// Shared memory storage layout type + union _TempStorage + { + typename WarpReduce::TempStorage warp_storage; ///< Storage for warp-synchronous reduction + typename BlockRakingLayout::TempStorage raking_grid; ///< Padded thread block raking grid + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + // Thread fields + _TempStorage &temp_storage; + unsigned int linear_tid; + + + /// Constructor + __device__ __forceinline__ BlockReduceRaking( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + template + __device__ __forceinline__ T RakingReduction( + ReductionOp reduction_op, ///< [in] Binary scan operator + T *raking_segment, + T partial, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items + int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + Int2Type /*iteration*/) + { + // Update partial if addend is in range + if ((IS_FULL_TILE && RAKING_UNGUARDED) || ((linear_tid * SEGMENT_LENGTH) + ITERATION < num_valid)) + { + T addend = raking_segment[ITERATION]; + partial = reduction_op(partial, addend); + } + return RakingReduction(reduction_op, raking_segment, partial, num_valid, Int2Type()); + } + + template + __device__ __forceinline__ T RakingReduction( + ReductionOp /*reduction_op*/, ///< [in] Binary scan operator + T * /*raking_segment*/, + T partial, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items + int /*num_valid*/, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + Int2Type /*iteration*/) + { + return partial; + } + + + + /// Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. 
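+    ///
+    /// A sketch of how this specialization is typically reached from user code via the
+    /// public cub::BlockReduce wrapper (the 128-thread sizing, thread_item, and d_out
+    /// are illustrative assumptions, not part of this file):
+    /// \code
+    /// typedef cub::BlockReduce<int, 128, cub::BLOCK_REDUCE_RAKING> BlockReduceT;
+    /// __shared__ typename BlockReduceT::TempStorage temp_storage;
+    /// int aggregate = BlockReduceT(temp_storage).Reduce(thread_item, cub::Max());
+    /// if (threadIdx.x == 0) d_out[blockIdx.x] = aggregate;   // result valid in thread0 only
+    /// \endcode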
+ template < + bool IS_FULL_TILE, + typename ReductionOp> + __device__ __forceinline__ T Reduce( + T partial, ///< [in] Calling thread's input partial reductions + int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + ReductionOp reduction_op) ///< [in] Binary reduction operator + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp synchronous reduction (unguarded if active threads is a power-of-two) + partial = WarpReduce(temp_storage.warp_storage).template Reduce( + partial, + num_valid, + reduction_op); + } + else + { + // Place partial into shared memory grid. + *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid) = partial; + + CTA_SYNC(); + + // Reduce parallelism to one warp + if (linear_tid < RAKING_THREADS) + { + // Raking reduction in grid + T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); + partial = raking_segment[0]; + + partial = RakingReduction(reduction_op, raking_segment, partial, num_valid, Int2Type<1>()); + + int valid_raking_threads = (IS_FULL_TILE) ? + RAKING_THREADS : + (num_valid + SEGMENT_LENGTH - 1) / SEGMENT_LENGTH; + + partial = WarpReduce(temp_storage.warp_storage).template Reduce( + partial, + valid_raking_threads, + reduction_op); + + } + } + + return partial; + } + + + /// Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. + template + __device__ __forceinline__ T Sum( + T partial, ///< [in] Calling thread's input partial reductions + int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + { + cub::Sum reduction_op; + + return Reduce(partial, num_valid, reduction_op); + } + + + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/block/specializations/block_reduce_raking_commutative_only.cuh b/GraphBLAS/CUDA/local_cub/block/specializations/block_reduce_raking_commutative_only.cuh new file mode 100644 index 0000000000..454fdafa50 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/block/specializations/block_reduce_raking_commutative_only.cuh @@ -0,0 +1,199 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block. Does not support non-commutative reduction operators. + */ + +#pragma once + +#include "block_reduce_raking.cuh" +#include "../../warp/warp_reduce.cuh" +#include "../../thread/thread_reduce.cuh" +#include "../../util_ptx.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief BlockReduceRakingCommutativeOnly provides raking-based methods of parallel reduction across a CUDA thread block. Does not support non-commutative reduction operators. Does not support block sizes that are not a multiple of the warp size. + */ +template < + typename T, ///< Data type being reduced + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockReduceRakingCommutativeOnly +{ + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + // The fall-back implementation to use when BLOCK_THREADS is not a multiple of the warp size or not all threads have valid values + typedef BlockReduceRaking FallBack; + + /// Constants + enum + { + /// Number of warp threads + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), + + /// Whether or not to use fall-back + USE_FALLBACK = ((BLOCK_THREADS % WARP_THREADS != 0) || (BLOCK_THREADS <= WARP_THREADS)), + + /// Number of raking threads + RAKING_THREADS = WARP_THREADS, + + /// Number of threads actually sharing items with the raking threads + SHARING_THREADS = CUB_MAX(1, BLOCK_THREADS - RAKING_THREADS), + + /// Number of raking elements per warp synchronous raking thread + SEGMENT_LENGTH = SHARING_THREADS / WARP_THREADS, + }; + + /// WarpReduce utility type + typedef WarpReduce WarpReduce; + + /// Layout type for padded thread block raking grid + typedef BlockRakingLayout BlockRakingLayout; + + /// Shared memory storage layout type + union _TempStorage + { + struct + { + typename WarpReduce::TempStorage warp_storage; ///< Storage for warp-synchronous reduction + typename BlockRakingLayout::TempStorage raking_grid; ///< Padded thread block raking grid + }; + typename FallBack::TempStorage fallback_storage; ///< Fall-back storage for non-commutative block scan + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + // Thread fields + _TempStorage &temp_storage; + unsigned int linear_tid; + + + /// Constructor + __device__ __forceinline__ BlockReduceRakingCommutativeOnly( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + 
linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)) + {} + + + /// Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. + template + __device__ __forceinline__ T Sum( + T partial, ///< [in] Calling thread's input partial reductions + int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + { + if (USE_FALLBACK || !FULL_TILE) + { + return FallBack(temp_storage.fallback_storage).template Sum(partial, num_valid); + } + else + { + // Place partial into shared memory grid + if (linear_tid >= RAKING_THREADS) + *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid - RAKING_THREADS) = partial; + + CTA_SYNC(); + + // Reduce parallelism to one warp + if (linear_tid < RAKING_THREADS) + { + // Raking reduction in grid + T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); + partial = internal::ThreadReduce(raking_segment, cub::Sum(), partial); + + // Warpscan + partial = WarpReduce(temp_storage.warp_storage).Sum(partial); + } + } + + return partial; + } + + + /// Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. + template < + bool FULL_TILE, + typename ReductionOp> + __device__ __forceinline__ T Reduce( + T partial, ///< [in] Calling thread's input partial reductions + int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + ReductionOp reduction_op) ///< [in] Binary reduction operator + { + if (USE_FALLBACK || !FULL_TILE) + { + return FallBack(temp_storage.fallback_storage).template Reduce(partial, num_valid, reduction_op); + } + else + { + // Place partial into shared memory grid + if (linear_tid >= RAKING_THREADS) + *BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid - RAKING_THREADS) = partial; + + CTA_SYNC(); + + // Reduce parallelism to one warp + if (linear_tid < RAKING_THREADS) + { + // Raking reduction in grid + T *raking_segment = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid); + partial = internal::ThreadReduce(raking_segment, reduction_op, partial); + + // Warpscan + partial = WarpReduce(temp_storage.warp_storage).Reduce(partial, reduction_op); + } + } + + return partial; + } + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/block/specializations/block_reduce_warp_reductions.cuh b/GraphBLAS/CUDA/local_cub/block/specializations/block_reduce_warp_reductions.cuh new file mode 100644 index 0000000000..10ba303b4c --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/block/specializations/block_reduce_warp_reductions.cuh @@ -0,0 +1,218 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA thread block. Supports non-commutative reduction operators. + */ + +#pragma once + +#include "../../warp/warp_reduce.cuh" +#include "../../util_ptx.cuh" +#include "../../util_arch.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief BlockReduceWarpReductions provides variants of warp-reduction-based parallel reduction across a CUDA thread block. Supports non-commutative reduction operators. 
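+ *
+ * \par
+ * A minimal usage sketch through the public cub::BlockReduce wrapper with the
+ * corresponding policy (the 256-thread sizing and thread_value are illustrative
+ * assumptions):
+ * \code
+ * typedef cub::BlockReduce<float, 256, cub::BLOCK_REDUCE_WARP_REDUCTIONS> BlockReduceT;
+ * __shared__ typename BlockReduceT::TempStorage temp_storage;
+ * float block_sum = BlockReduceT(temp_storage).Sum(thread_value);  // valid in thread0 only
+ * \endcode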
+ */ +template < + typename T, ///< Data type being reduced + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockReduceWarpReductions +{ + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + /// Number of warp threads + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), + + /// Number of active warps + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + + /// The logical warp size for warp reductions + LOGICAL_WARP_SIZE = CUB_MIN(BLOCK_THREADS, WARP_THREADS), + + /// Whether or not the logical warp size evenly divides the thread block size + EVEN_WARP_MULTIPLE = (BLOCK_THREADS % LOGICAL_WARP_SIZE == 0) + }; + + + /// WarpReduce utility type + typedef typename WarpReduce::InternalWarpReduce WarpReduce; + + + /// Shared memory storage layout type + struct _TempStorage + { + typename WarpReduce::TempStorage warp_reduce[WARPS]; ///< Buffer for warp-synchronous scan + T warp_aggregates[WARPS]; ///< Shared totals from each warp-synchronous scan + T block_prefix; ///< Shared prefix for the entire thread block + }; + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + // Thread fields + _TempStorage &temp_storage; + int linear_tid; + int warp_id; + int lane_id; + + + /// Constructor + __device__ __forceinline__ BlockReduceWarpReductions( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), + warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS), + lane_id(LaneId()) + {} + + + template + __device__ __forceinline__ T ApplyWarpAggregates( + ReductionOp reduction_op, ///< [in] Binary scan operator + T warp_aggregate, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items + int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + Int2Type /*successor_warp*/) + { + if (FULL_TILE || (SUCCESSOR_WARP * LOGICAL_WARP_SIZE < num_valid)) + { + T addend = temp_storage.warp_aggregates[SUCCESSOR_WARP]; + warp_aggregate = reduction_op(warp_aggregate, addend); + } + return ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid, Int2Type()); + } + + template + __device__ __forceinline__ T ApplyWarpAggregates( + ReductionOp /*reduction_op*/, ///< [in] Binary scan operator + T warp_aggregate, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items + int /*num_valid*/, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + Int2Type /*successor_warp*/) + { + return warp_aggregate; + } + + + /// Returns block-wide aggregate in thread0. 
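+    ///
+    /// Each warp's lane0 first publishes its warp-wide partial into
+    /// temp_storage.warp_aggregates; after a CTA_SYNC(), thread0 folds the aggregates
+    /// of the remaining warps into its own. For example, with 128 threads and 32-thread
+    /// warps there are 4 warp aggregates, and thread0 applies the reduction operator
+    /// to warp_aggregates[1..3].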
+ template < + bool FULL_TILE, + typename ReductionOp> + __device__ __forceinline__ T ApplyWarpAggregates( + ReductionOp reduction_op, ///< [in] Binary scan operator + T warp_aggregate, ///< [in] [lane0 only] Warp-wide aggregate reduction of input items + int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + { + // Share lane aggregates + if (lane_id == 0) + { + temp_storage.warp_aggregates[warp_id] = warp_aggregate; + } + + CTA_SYNC(); + + // Update total aggregate in warp 0, lane 0 + if (linear_tid == 0) + { + warp_aggregate = ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid, Int2Type<1>()); + } + + return warp_aggregate; + } + + + /// Computes a thread block-wide reduction using addition (+) as the reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. + template + __device__ __forceinline__ T Sum( + T input, ///< [in] Calling thread's input partial reductions + int num_valid) ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + { + cub::Sum reduction_op; + int warp_offset = (warp_id * LOGICAL_WARP_SIZE); + int warp_num_valid = ((FULL_TILE && EVEN_WARP_MULTIPLE) || (warp_offset + LOGICAL_WARP_SIZE <= num_valid)) ? + LOGICAL_WARP_SIZE : + num_valid - warp_offset; + + // Warp reduction in every warp + T warp_aggregate = WarpReduce(temp_storage.warp_reduce[warp_id]).template Reduce<(FULL_TILE && EVEN_WARP_MULTIPLE)>( + input, + warp_num_valid, + cub::Sum()); + + // Update outputs and block_aggregate with warp-wide aggregates from lane-0s + return ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid); + } + + + /// Computes a thread block-wide reduction using the specified reduction operator. The first num_valid threads each contribute one reduction partial. The return value is only valid for thread0. + template < + bool FULL_TILE, + typename ReductionOp> + __device__ __forceinline__ T Reduce( + T input, ///< [in] Calling thread's input partial reductions + int num_valid, ///< [in] Number of valid elements (may be less than BLOCK_THREADS) + ReductionOp reduction_op) ///< [in] Binary reduction operator + { + int warp_offset = warp_id * LOGICAL_WARP_SIZE; + int warp_num_valid = ((FULL_TILE && EVEN_WARP_MULTIPLE) || (warp_offset + LOGICAL_WARP_SIZE <= num_valid)) ? + LOGICAL_WARP_SIZE : + num_valid - warp_offset; + + // Warp reduction in every warp + T warp_aggregate = WarpReduce(temp_storage.warp_reduce[warp_id]).template Reduce<(FULL_TILE && EVEN_WARP_MULTIPLE)>( + input, + warp_num_valid, + reduction_op); + + // Update outputs and block_aggregate with warp-wide aggregates from lane-0s + return ApplyWarpAggregates(reduction_op, warp_aggregate, num_valid); + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/block/specializations/block_scan_raking.cuh b/GraphBLAS/CUDA/local_cub/block/specializations/block_scan_raking.cuh new file mode 100644 index 0000000000..a855cda0ba --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/block/specializations/block_scan_raking.cuh @@ -0,0 +1,666 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + + +/** + * \file + * cub::BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA thread block. + */ + +#pragma once + +#include "../../util_ptx.cuh" +#include "../../util_arch.cuh" +#include "../../block/block_raking_layout.cuh" +#include "../../thread/thread_reduce.cuh" +#include "../../thread/thread_scan.cuh" +#include "../../warp/warp_scan.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief BlockScanRaking provides variants of raking-based parallel prefix scan across a CUDA thread block. 
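+ *
+ * \par
+ * A minimal sketch of reaching this specialization through the public cub::BlockScan
+ * wrapper (the 128-thread sizing and thread_in are illustrative assumptions):
+ * \code
+ * typedef cub::BlockScan<int, 128, cub::BLOCK_SCAN_RAKING> BlockScanT;
+ * __shared__ typename BlockScanT::TempStorage temp_storage;
+ * int thread_out;
+ * BlockScanT(temp_storage).ExclusiveSum(thread_in, thread_out);
+ * \endcode
+ * The cub::BLOCK_SCAN_RAKING_MEMOIZE policy maps to this same specialization with
+ * MEMOIZE = true, trading extra registers for fewer shared memory reads.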
+ */ +template < + typename T, ///< Data type being scanned + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + bool MEMOIZE, ///< Whether or not to buffer outer raking scan partials to incur fewer shared memory reads at the expense of higher register pressure + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockScanRaking +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + }; + + /// Layout type for padded thread block raking grid + typedef BlockRakingLayout BlockRakingLayout; + + /// Constants + enum + { + /// Number of raking threads + RAKING_THREADS = BlockRakingLayout::RAKING_THREADS, + + /// Number of raking elements per warp synchronous raking thread + SEGMENT_LENGTH = BlockRakingLayout::SEGMENT_LENGTH, + + /// Cooperative work can be entirely warp synchronous + WARP_SYNCHRONOUS = (BLOCK_THREADS == RAKING_THREADS), + }; + + /// WarpScan utility type + typedef WarpScan WarpScan; + + /// Shared memory storage layout type + struct _TempStorage + { + typename WarpScan::TempStorage warp_scan; ///< Buffer for warp-synchronous scan + typename BlockRakingLayout::TempStorage raking_grid; ///< Padded thread block raking grid + T block_aggregate; ///< Block aggregate + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + // Thread fields + _TempStorage &temp_storage; + unsigned int linear_tid; + T cached_segment[SEGMENT_LENGTH]; + + + //--------------------------------------------------------------------- + // Utility methods + //--------------------------------------------------------------------- + + /// Templated reduction + template + __device__ __forceinline__ T GuardedReduce( + T* raking_ptr, ///< [in] Input array + ScanOp scan_op, ///< [in] Binary reduction operator + T raking_partial, ///< [in] Prefix to seed reduction with + Int2Type /*iteration*/) + { + if ((BlockRakingLayout::UNGUARDED) || (((linear_tid * SEGMENT_LENGTH) + ITERATION) < BLOCK_THREADS)) + { + T addend = raking_ptr[ITERATION]; + raking_partial = scan_op(raking_partial, addend); + } + + return GuardedReduce(raking_ptr, scan_op, raking_partial, Int2Type()); + } + + + /// Templated reduction (base case) + template + __device__ __forceinline__ T GuardedReduce( + T* /*raking_ptr*/, ///< [in] Input array + ScanOp /*scan_op*/, ///< [in] Binary reduction operator + T raking_partial, ///< [in] Prefix to seed reduction with + Int2Type /*iteration*/) + { + return raking_partial; + } + + + /// Templated copy + template + __device__ __forceinline__ void CopySegment( + T* out, ///< [out] Out array + T* in, ///< [in] Input array + Int2Type /*iteration*/) + { + out[ITERATION] = in[ITERATION]; + CopySegment(out, in, Int2Type()); + } + + + /// Templated copy (base case) + __device__ __forceinline__ void CopySegment( + T* /*out*/, ///< [out] Out array + T* /*in*/, ///< [in] Input array + Int2Type /*iteration*/) + {} 
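+
+    // The scan methods below use this raking pattern: (1) every thread deposits its
+    // partial into the shared raking grid; (2) the first RAKING_THREADS threads each
+    // serially reduce a SEGMENT_LENGTH-long segment (Upsweep) and cooperate in a
+    // warp-synchronous scan of the segment totals; (3) each raking thread scans its
+    // segment back through shared memory (ExclusiveDownsweep / InclusiveDownsweep),
+    // after which every thread reads its own result from the grid.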
+
+
+    /// Performs upsweep raking reduction, returning the aggregate
+    template <typename ScanOp>
+    __device__ __forceinline__ T Upsweep(
+        ScanOp scan_op)
+    {
+        T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
+
+        // Read data into registers
+        CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>());
+
+        T raking_partial = cached_segment[0];
+
+        return GuardedReduce(cached_segment, scan_op, raking_partial, Int2Type<1>());
+    }
+
+
+    /// Performs exclusive downsweep raking scan
+    template <typename ScanOp>
+    __device__ __forceinline__ void ExclusiveDownsweep(
+        ScanOp          scan_op,
+        T               raking_partial,
+        bool            apply_prefix = true)
+    {
+        T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
+
+        // Read data back into registers
+        if (!MEMOIZE)
+        {
+            CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>());
+        }
+
+        internal::ThreadScanExclusive<SEGMENT_LENGTH>(cached_segment, cached_segment, scan_op, raking_partial, apply_prefix);
+
+        // Write data back to smem
+        CopySegment(smem_raking_ptr, cached_segment, Int2Type<0>());
+    }
+
+
+    /// Performs inclusive downsweep raking scan
+    template <typename ScanOp>
+    __device__ __forceinline__ void InclusiveDownsweep(
+        ScanOp          scan_op,
+        T               raking_partial,
+        bool            apply_prefix = true)
+    {
+        T *smem_raking_ptr = BlockRakingLayout::RakingPtr(temp_storage.raking_grid, linear_tid);
+
+        // Read data back into registers
+        if (!MEMOIZE)
+        {
+            CopySegment(cached_segment, smem_raking_ptr, Int2Type<0>());
+        }
+
+        internal::ThreadScanInclusive<SEGMENT_LENGTH>(cached_segment, cached_segment, scan_op, raking_partial, apply_prefix);
+
+        // Write data back to smem
+        CopySegment(smem_raking_ptr, cached_segment, Int2Type<0>());
+    }
+
+
+    //---------------------------------------------------------------------
+    // Constructors
+    //---------------------------------------------------------------------
+
+    /// Constructor
+    __device__ __forceinline__ BlockScanRaking(
+        TempStorage &temp_storage)
+    :
+        temp_storage(temp_storage.Alias()),
+        linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z))
+    {}
+
+
+    //---------------------------------------------------------------------
+    // Exclusive scans
+    //---------------------------------------------------------------------
+
+    /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor.  Each thread contributes one input element.  With no initial value, the output computed for thread0 is undefined.
+ template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp-synchronous scan + WarpScan(temp_storage.warp_scan).ExclusiveScan(input, exclusive_output, scan_op); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + CTA_SYNC(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction across shared partials + T upsweep_partial = Upsweep(scan_op); + + // Warp-synchronous scan + T exclusive_partial; + WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, scan_op); + + // Exclusive raking downsweep scan + ExclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); + } + + CTA_SYNC(); + + // Grab thread prefix from shared memory + exclusive_output = *placement_ptr; + } + } + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op) ///< [in] Binary scan operator + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp-synchronous scan + WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, initial_value, scan_op); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + CTA_SYNC(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction across shared partials + T upsweep_partial = Upsweep(scan_op); + + // Exclusive Warp-synchronous scan + T exclusive_partial; + WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, initial_value, scan_op); + + // Exclusive raking downsweep scan + ExclusiveDownsweep(scan_op, exclusive_partial); + } + + CTA_SYNC(); + + // Grab exclusive partial from shared memory + output = *placement_ptr; + } + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. 
+ template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp-synchronous scan + WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, scan_op, block_aggregate); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + CTA_SYNC(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction across shared partials + T upsweep_partial= Upsweep(scan_op); + + // Warp-synchronous scan + T inclusive_partial; + T exclusive_partial; + WarpScan(temp_storage.warp_scan).Scan(upsweep_partial, inclusive_partial, exclusive_partial, scan_op); + + // Exclusive raking downsweep scan + ExclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); + + // Broadcast aggregate to all threads + if (linear_tid == RAKING_THREADS - 1) + temp_storage.block_aggregate = inclusive_partial; + } + + CTA_SYNC(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + } + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp-synchronous scan + WarpScan(temp_storage.warp_scan).ExclusiveScan(input, output, initial_value, scan_op, block_aggregate); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + CTA_SYNC(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction across shared partials + T upsweep_partial = Upsweep(scan_op); + + // Warp-synchronous scan + T exclusive_partial; + WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, initial_value, scan_op, block_aggregate); + + // Exclusive raking downsweep scan + ExclusiveDownsweep(scan_op, exclusive_partial); + + // Broadcast aggregate to other threads + if (linear_tid == 0) + temp_storage.block_aggregate = block_aggregate; + } + + CTA_SYNC(); + + // Grab exclusive partial from shared memory + output = *placement_ptr; + + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + } + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. 
the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp-synchronous scan + T block_aggregate; + WarpScan warp_scan(temp_storage.warp_scan); + warp_scan.ExclusiveScan(input, output, scan_op, block_aggregate); + + // Obtain warp-wide prefix in lane0, then broadcast to other lanes + T block_prefix = block_prefix_callback_op(block_aggregate); + block_prefix = warp_scan.Broadcast(block_prefix, 0); + + output = scan_op(block_prefix, output); + if (linear_tid == 0) + output = block_prefix; + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + CTA_SYNC(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + WarpScan warp_scan(temp_storage.warp_scan); + + // Raking upsweep reduction across shared partials + T upsweep_partial = Upsweep(scan_op); + + // Warp-synchronous scan + T exclusive_partial, block_aggregate; + warp_scan.ExclusiveScan(upsweep_partial, exclusive_partial, scan_op, block_aggregate); + + // Obtain block-wide prefix in lane0, then broadcast to other lanes + T block_prefix = block_prefix_callback_op(block_aggregate); + block_prefix = warp_scan.Broadcast(block_prefix, 0); + + // Update prefix with warpscan exclusive partial + T downsweep_prefix = scan_op(block_prefix, exclusive_partial); + if (linear_tid == 0) + downsweep_prefix = block_prefix; + + // Exclusive raking downsweep scan + ExclusiveDownsweep(scan_op, downsweep_prefix); + } + + CTA_SYNC(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + } + } + + + //--------------------------------------------------------------------- + // Inclusive scans + //--------------------------------------------------------------------- + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. 
+ template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp-synchronous scan + WarpScan(temp_storage.warp_scan).InclusiveScan(input, output, scan_op); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + CTA_SYNC(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction across shared partials + T upsweep_partial = Upsweep(scan_op); + + // Exclusive Warp-synchronous scan + T exclusive_partial; + WarpScan(temp_storage.warp_scan).ExclusiveScan(upsweep_partial, exclusive_partial, scan_op); + + // Inclusive raking downsweep scan + InclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); + } + + CTA_SYNC(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + } + } + + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp-synchronous scan + WarpScan(temp_storage.warp_scan).InclusiveScan(input, output, scan_op, block_aggregate); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + CTA_SYNC(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + // Raking upsweep reduction across shared partials + T upsweep_partial = Upsweep(scan_op); + + // Warp-synchronous scan + T inclusive_partial; + T exclusive_partial; + WarpScan(temp_storage.warp_scan).Scan(upsweep_partial, inclusive_partial, exclusive_partial, scan_op); + + // Inclusive raking downsweep scan + InclusiveDownsweep(scan_op, exclusive_partial, (linear_tid != 0)); + + // Broadcast aggregate to all threads + if (linear_tid == RAKING_THREADS - 1) + temp_storage.block_aggregate = inclusive_partial; + } + + CTA_SYNC(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + } + } + + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
+ template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. + { + if (WARP_SYNCHRONOUS) + { + // Short-circuit directly to warp-synchronous scan + T block_aggregate; + WarpScan warp_scan(temp_storage.warp_scan); + warp_scan.InclusiveScan(input, output, scan_op, block_aggregate); + + // Obtain warp-wide prefix in lane0, then broadcast to other lanes + T block_prefix = block_prefix_callback_op(block_aggregate); + block_prefix = warp_scan.Broadcast(block_prefix, 0); + + // Update prefix with exclusive warpscan partial + output = scan_op(block_prefix, output); + } + else + { + // Place thread partial into shared memory raking grid + T *placement_ptr = BlockRakingLayout::PlacementPtr(temp_storage.raking_grid, linear_tid); + *placement_ptr = input; + + CTA_SYNC(); + + // Reduce parallelism down to just raking threads + if (linear_tid < RAKING_THREADS) + { + WarpScan warp_scan(temp_storage.warp_scan); + + // Raking upsweep reduction across shared partials + T upsweep_partial = Upsweep(scan_op); + + // Warp-synchronous scan + T exclusive_partial, block_aggregate; + warp_scan.ExclusiveScan(upsweep_partial, exclusive_partial, scan_op, block_aggregate); + + // Obtain block-wide prefix in lane0, then broadcast to other lanes + T block_prefix = block_prefix_callback_op(block_aggregate); + block_prefix = warp_scan.Broadcast(block_prefix, 0); + + // Update prefix with warpscan exclusive partial + T downsweep_prefix = scan_op(block_prefix, exclusive_partial); + if (linear_tid == 0) + downsweep_prefix = block_prefix; + + // Inclusive raking downsweep scan + InclusiveDownsweep(scan_op, downsweep_prefix); + } + + CTA_SYNC(); + + // Grab thread prefix from shared memory + output = *placement_ptr; + } + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/block/specializations/block_scan_warp_scans.cuh b/GraphBLAS/CUDA/local_cub/block/specializations/block_scan_warp_scans.cuh new file mode 100644 index 0000000000..85e4d6135a --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/block/specializations/block_scan_warp_scans.cuh @@ -0,0 +1,392 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockScanWarpscans provides warpscan-based variants of parallel prefix scan across a CUDA thread block. + */ + +#pragma once + +#include "../../util_arch.cuh" +#include "../../util_ptx.cuh" +#include "../../warp/warp_scan.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread block. + */ +template < + typename T, + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockScanWarpScans +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + /// Constants + enum + { + /// Number of warp threads + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), + + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + /// Number of active warps + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + }; + + /// WarpScan utility type + typedef WarpScan WarpScanT; + + /// WarpScan utility type + typedef WarpScan WarpAggregateScan; + + /// Shared memory storage layout type + + struct __align__(32) _TempStorage + { + T warp_aggregates[WARPS]; + typename WarpScanT::TempStorage warp_scan[WARPS]; ///< Buffer for warp-synchronous scans + T block_prefix; ///< Shared prefix for the entire thread block + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + // Thread fields + _TempStorage &temp_storage; + unsigned int linear_tid; + unsigned int warp_id; + unsigned int lane_id; + + + //--------------------------------------------------------------------- + // Constructors + //--------------------------------------------------------------------- + + /// Constructor + __device__ __forceinline__ BlockScanWarpScans( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), + warp_id((WARPS == 1) ? 
0 : linear_tid / WARP_THREADS), + lane_id(LaneId()) + {} + + + //--------------------------------------------------------------------- + // Utility methods + //--------------------------------------------------------------------- + + template + __device__ __forceinline__ void ApplyWarpAggregates( + T &warp_prefix, ///< [out] The calling thread's partial reduction + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items + Int2Type /*addend_warp*/) + { + if (warp_id == WARP) + warp_prefix = block_aggregate; + + T addend = temp_storage.warp_aggregates[WARP]; + block_aggregate = scan_op(block_aggregate, addend); + + ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type()); + } + + template + __device__ __forceinline__ void ApplyWarpAggregates( + T &/*warp_prefix*/, ///< [out] The calling thread's partial reduction + ScanOp /*scan_op*/, ///< [in] Binary scan operator + T &/*block_aggregate*/, ///< [out] Threadblock-wide aggregate reduction of input items + Int2Type /*addend_warp*/) + {} + + + /// Use the warp-wide aggregates to compute the calling warp's prefix. Also returns block-wide aggregate in all threads. + template + __device__ __forceinline__ T ComputeWarpPrefix( + ScanOp scan_op, ///< [in] Binary scan operator + T warp_aggregate, ///< [in] [laneWARP_THREADS - 1 only] Warp-wide aggregate reduction of input items + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + // Last lane in each warp shares its warp-aggregate + if (lane_id == WARP_THREADS - 1) + temp_storage.warp_aggregates[warp_id] = warp_aggregate; + + CTA_SYNC(); + + // Accumulate block aggregates and save the one that is our warp's prefix + T warp_prefix; + block_aggregate = temp_storage.warp_aggregates[0]; + + // Use template unrolling (since the PTX backend can't handle unrolling it for SM1x) + ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type<1>()); +/* + #pragma unroll + for (int WARP = 1; WARP < WARPS; ++WARP) + { + if (warp_id == WARP) + warp_prefix = block_aggregate; + + T addend = temp_storage.warp_aggregates[WARP]; + block_aggregate = scan_op(block_aggregate, addend); + } +*/ + + return warp_prefix; + } + + + /// Use the warp-wide aggregates and initial-value to compute the calling warp's prefix. Also returns block-wide aggregate in all threads. + template + __device__ __forceinline__ T ComputeWarpPrefix( + ScanOp scan_op, ///< [in] Binary scan operator + T warp_aggregate, ///< [in] [laneWARP_THREADS - 1 only] Warp-wide aggregate reduction of input items + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items + const T &initial_value) ///< [in] Initial value to seed the exclusive scan + { + T warp_prefix = ComputeWarpPrefix(scan_op, warp_aggregate, block_aggregate); + + warp_prefix = scan_op(initial_value, warp_prefix); + + if (warp_id == 0) + warp_prefix = initial_value; + + return warp_prefix; + } + + //--------------------------------------------------------------------- + // Exclusive scans + //--------------------------------------------------------------------- + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined. 
+ template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator + { + // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid. + T block_aggregate; + ExclusiveScan(input, exclusive_output, scan_op, block_aggregate); + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op) ///< [in] Binary scan operator + { + T block_aggregate; + ExclusiveScan(input, exclusive_output, initial_value, scan_op, block_aggregate); + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. + T inclusive_output; + WarpScanT(temp_storage.warp_scan[warp_id]).Scan(input, inclusive_output, exclusive_output, scan_op); + + // Compute the warp-wide prefix and block-wide aggregate for each warp. Warp prefix for warp0 is invalid. + T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate); + + // Apply warp prefix to our lane's partial + if (warp_id != 0) + { + exclusive_output = scan_op(warp_prefix, exclusive_output); + if (lane_id == 0) + exclusive_output = warp_prefix; + } + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. 
+ T inclusive_output; + WarpScanT(temp_storage.warp_scan[warp_id]).Scan(input, inclusive_output, exclusive_output, scan_op); + + // Compute the warp-wide prefix and block-wide aggregate for each warp + T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate, initial_value); + + // Apply warp prefix to our lane's partial + exclusive_output = scan_op(warp_prefix, exclusive_output); + if (lane_id == 0) + exclusive_output = warp_prefix; + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. + { + // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid. + T block_aggregate; + ExclusiveScan(input, exclusive_output, scan_op, block_aggregate); + + // Use the first warp to determine the thread block prefix, returning the result in lane0 + if (warp_id == 0) + { + T block_prefix = block_prefix_callback_op(block_aggregate); + if (lane_id == 0) + { + // Share the prefix with all threads + temp_storage.block_prefix = block_prefix; + exclusive_output = block_prefix; // The block prefix is the exclusive output for tid0 + } + } + + CTA_SYNC(); + + // Incorporate thread block prefix into outputs + T block_prefix = temp_storage.block_prefix; + if (linear_tid > 0) + { + exclusive_output = scan_op(block_prefix, exclusive_output); + } + } + + + //--------------------------------------------------------------------- + // Inclusive scans + //--------------------------------------------------------------------- + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator + { + T block_aggregate; + InclusiveScan(input, inclusive_output, scan_op, block_aggregate); + } + + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
+ template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + WarpScanT(temp_storage.warp_scan[warp_id]).InclusiveScan(input, inclusive_output, scan_op); + + // Compute the warp-wide prefix and block-wide aggregate for each warp. Warp prefix for warp0 is invalid. + T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate); + + // Apply warp prefix to our lane's partial + if (warp_id != 0) + { + inclusive_output = scan_op(warp_prefix, inclusive_output); + } + } + + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. + { + T block_aggregate; + InclusiveScan(input, exclusive_output, scan_op, block_aggregate); + + // Use the first warp to determine the thread block prefix, returning the result in lane0 + if (warp_id == 0) + { + T block_prefix = block_prefix_callback_op(block_aggregate); + if (lane_id == 0) + { + // Share the prefix with all threads + temp_storage.block_prefix = block_prefix; + } + } + + CTA_SYNC(); + + // Incorporate thread block prefix into outputs + T block_prefix = temp_storage.block_prefix; + exclusive_output = scan_op(block_prefix, exclusive_output); + } + + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/block/specializations/block_scan_warp_scans2.cuh b/GraphBLAS/CUDA/local_cub/block/specializations/block_scan_warp_scans2.cuh new file mode 100644 index 0000000000..4de7c69b70 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/block/specializations/block_scan_warp_scans2.cuh @@ -0,0 +1,436 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockScanWarpscans provides warpscan-based variants of parallel prefix scan across a CUDA thread block. + */ + +#pragma once + +#include "../../util_arch.cuh" +#include "../../util_ptx.cuh" +#include "../../warp/warp_scan.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread block. + */ +template < + typename T, + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockScanWarpScans +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + /// Constants + enum + { + /// Number of warp threads + WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), + + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + /// Number of active warps + WARPS = (BLOCK_THREADS + WARP_THREADS - 1) / WARP_THREADS, + }; + + /// WarpScan utility type + typedef WarpScan WarpScanT; + + /// WarpScan utility type + typedef WarpScan WarpAggregateScanT; + + /// Shared memory storage layout type + struct _TempStorage + { + typename WarpAggregateScanT::TempStorage inner_scan[WARPS]; ///< Buffer for warp-synchronous scans + typename WarpScanT::TempStorage warp_scan[WARPS]; ///< Buffer for warp-synchronous scans + T warp_aggregates[WARPS]; + T block_prefix; ///< Shared prefix for the entire thread block + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + // Thread fields + _TempStorage &temp_storage; + unsigned int linear_tid; + unsigned int warp_id; + unsigned int lane_id; + + + //--------------------------------------------------------------------- + // Constructors + //--------------------------------------------------------------------- + + /// Constructor + __device__ 
__forceinline__ BlockScanWarpScans( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), + warp_id((WARPS == 1) ? 0 : linear_tid / WARP_THREADS), + lane_id(LaneId()) + {} + + + //--------------------------------------------------------------------- + // Utility methods + //--------------------------------------------------------------------- + + template + __device__ __forceinline__ void ApplyWarpAggregates( + T &warp_prefix, ///< [out] The calling thread's partial reduction + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items + Int2Type addend_warp) + { + if (warp_id == WARP) + warp_prefix = block_aggregate; + + T addend = temp_storage.warp_aggregates[WARP]; + block_aggregate = scan_op(block_aggregate, addend); + + ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type()); + } + + template + __device__ __forceinline__ void ApplyWarpAggregates( + T &warp_prefix, ///< [out] The calling thread's partial reduction + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items + Int2Type addend_warp) + {} + + + /// Use the warp-wide aggregates to compute the calling warp's prefix. Also returns block-wide aggregate in all threads. + template + __device__ __forceinline__ T ComputeWarpPrefix( + ScanOp scan_op, ///< [in] Binary scan operator + T warp_aggregate, ///< [in] [laneWARP_THREADS - 1 only] Warp-wide aggregate reduction of input items + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + // Last lane in each warp shares its warp-aggregate + if (lane_id == WARP_THREADS - 1) + temp_storage.warp_aggregates[warp_id] = warp_aggregate; + + CTA_SYNC(); + + // Accumulate block aggregates and save the one that is our warp's prefix + T warp_prefix; + block_aggregate = temp_storage.warp_aggregates[0]; + + // Use template unrolling (since the PTX backend can't handle unrolling it for SM1x) + ApplyWarpAggregates(warp_prefix, scan_op, block_aggregate, Int2Type<1>()); +/* + #pragma unroll + for (int WARP = 1; WARP < WARPS; ++WARP) + { + if (warp_id == WARP) + warp_prefix = block_aggregate; + + T addend = temp_storage.warp_aggregates[WARP]; + block_aggregate = scan_op(block_aggregate, addend); + } +*/ + + return warp_prefix; + } + + + /// Use the warp-wide aggregates and initial-value to compute the calling warp's prefix. Also returns block-wide aggregate in all threads. + template + __device__ __forceinline__ T ComputeWarpPrefix( + ScanOp scan_op, ///< [in] Binary scan operator + T warp_aggregate, ///< [in] [laneWARP_THREADS - 1 only] Warp-wide aggregate reduction of input items + T &block_aggregate, ///< [out] Threadblock-wide aggregate reduction of input items + const T &initial_value) ///< [in] Initial value to seed the exclusive scan + { + T warp_prefix = ComputeWarpPrefix(scan_op, warp_aggregate, block_aggregate); + + warp_prefix = scan_op(initial_value, warp_prefix); + + if (warp_id == 0) + warp_prefix = initial_value; + + return warp_prefix; + } + + //--------------------------------------------------------------------- + // Exclusive scans + //--------------------------------------------------------------------- + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. 
With no initial value, the output computed for thread0 is undefined. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator + { + // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid. + T block_aggregate; + ExclusiveScan(input, exclusive_output, scan_op, block_aggregate); + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op) ///< [in] Binary scan operator + { + T block_aggregate; + ExclusiveScan(input, exclusive_output, initial_value, scan_op, block_aggregate); + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + WarpScanT my_warp_scan(temp_storage.warp_scan[warp_id]); + + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. + T inclusive_output; + my_warp_scan.Scan(input, inclusive_output, exclusive_output, scan_op); + + // Compute the warp-wide prefix and block-wide aggregate for each warp. Warp prefix for warp0 is invalid. +// T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate); + +//-------------------------------------------------- + // Last lane in each warp shares its warp-aggregate + if (lane_id == WARP_THREADS - 1) + temp_storage.warp_aggregates[warp_id] = inclusive_output; + + CTA_SYNC(); + + // Get the warp scan partial + T warp_inclusive, warp_prefix; + if (lane_id < WARPS) + { + // Scan the warpscan partials + T warp_val = temp_storage.warp_aggregates[lane_id]; + WarpAggregateScanT(temp_storage.inner_scan[warp_id]).Scan(warp_val, warp_inclusive, warp_prefix, scan_op); + } + + warp_prefix = my_warp_scan.Broadcast(warp_prefix, warp_id); + block_aggregate = my_warp_scan.Broadcast(warp_inclusive, WARPS - 1); +//-------------------------------------------------- + + // Apply warp prefix to our lane's partial + if (warp_id != 0) + { + exclusive_output = scan_op(warp_prefix, exclusive_output); + if (lane_id == 0) + exclusive_output = warp_prefix; + } + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. 
+ template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + WarpScanT my_warp_scan(temp_storage.warp_scan[warp_id]); + + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. + T inclusive_output; + my_warp_scan.Scan(input, inclusive_output, exclusive_output, scan_op); + + // Compute the warp-wide prefix and block-wide aggregate for each warp +// T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate, initial_value); + +//-------------------------------------------------- + // Last lane in each warp shares its warp-aggregate + if (lane_id == WARP_THREADS - 1) + temp_storage.warp_aggregates[warp_id] = inclusive_output; + + CTA_SYNC(); + + // Get the warp scan partial + T warp_inclusive, warp_prefix; + if (lane_id < WARPS) + { + // Scan the warpscan partials + T warp_val = temp_storage.warp_aggregates[lane_id]; + WarpAggregateScanT(temp_storage.inner_scan[warp_id]).Scan(warp_val, warp_inclusive, warp_prefix, initial_value, scan_op); + } + + warp_prefix = my_warp_scan.Broadcast(warp_prefix, warp_id); + block_aggregate = my_warp_scan.Broadcast(warp_inclusive, WARPS - 1); +//-------------------------------------------------- + + // Apply warp prefix to our lane's partial + exclusive_output = scan_op(warp_prefix, exclusive_output); + if (lane_id == 0) + exclusive_output = warp_prefix; + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. + { + // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid. 
+ T block_aggregate; + ExclusiveScan(input, exclusive_output, scan_op, block_aggregate); + + // Use the first warp to determine the thread block prefix, returning the result in lane0 + if (warp_id == 0) + { + T block_prefix = block_prefix_callback_op(block_aggregate); + if (lane_id == 0) + { + // Share the prefix with all threads + temp_storage.block_prefix = block_prefix; + exclusive_output = block_prefix; // The block prefix is the exclusive output for tid0 + } + } + + CTA_SYNC(); + + // Incorporate thread block prefix into outputs + T block_prefix = temp_storage.block_prefix; + if (linear_tid > 0) + { + exclusive_output = scan_op(block_prefix, exclusive_output); + } + } + + + //--------------------------------------------------------------------- + // Inclusive scans + //--------------------------------------------------------------------- + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator + { + T block_aggregate; + InclusiveScan(input, inclusive_output, scan_op, block_aggregate); + } + + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + WarpScanT(temp_storage.warp_scan[warp_id]).InclusiveScan(input, inclusive_output, scan_op); + + // Compute the warp-wide prefix and block-wide aggregate for each warp. Warp prefix for warp0 is invalid. + T warp_prefix = ComputeWarpPrefix(scan_op, inclusive_output, block_aggregate); + + // Apply warp prefix to our lane's partial + if (warp_id != 0) + { + inclusive_output = scan_op(warp_prefix, inclusive_output); + } + } + + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. 
+ { + T block_aggregate; + InclusiveScan(input, exclusive_output, scan_op, block_aggregate); + + // Use the first warp to determine the thread block prefix, returning the result in lane0 + if (warp_id == 0) + { + T block_prefix = block_prefix_callback_op(block_aggregate); + if (lane_id == 0) + { + // Share the prefix with all threads + temp_storage.block_prefix = block_prefix; + } + } + + CTA_SYNC(); + + // Incorporate thread block prefix into outputs + T block_prefix = temp_storage.block_prefix; + exclusive_output = scan_op(block_prefix, exclusive_output); + } + + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/block/specializations/block_scan_warp_scans3.cuh b/GraphBLAS/CUDA/local_cub/block/specializations/block_scan_warp_scans3.cuh new file mode 100644 index 0000000000..147ca4c5af --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/block/specializations/block_scan_warp_scans3.cuh @@ -0,0 +1,418 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::BlockScanWarpscans provides warpscan-based variants of parallel prefix scan across a CUDA thread block. + */ + +#pragma once + +#include "../../util_arch.cuh" +#include "../../util_ptx.cuh" +#include "../../warp/warp_scan.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief BlockScanWarpScans provides warpscan-based variants of parallel prefix scan across a CUDA thread block. 
+ */ +template < + typename T, + int BLOCK_DIM_X, ///< The thread block length in threads along the X dimension + int BLOCK_DIM_Y, ///< The thread block length in threads along the Y dimension + int BLOCK_DIM_Z, ///< The thread block length in threads along the Z dimension + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct BlockScanWarpScans +{ + //--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + /// Constants + enum + { + /// The thread block size in threads + BLOCK_THREADS = BLOCK_DIM_X * BLOCK_DIM_Y * BLOCK_DIM_Z, + + /// Number of warp threads + INNER_WARP_THREADS = CUB_WARP_THREADS(PTX_ARCH), + OUTER_WARP_THREADS = BLOCK_THREADS / INNER_WARP_THREADS, + + /// Number of outer scan warps + OUTER_WARPS = INNER_WARP_THREADS + }; + + /// Outer WarpScan utility type + typedef WarpScan OuterWarpScanT; + + /// Inner WarpScan utility type + typedef WarpScan InnerWarpScanT; + + typedef typename OuterWarpScanT::TempStorage OuterScanArray[OUTER_WARPS]; + + + /// Shared memory storage layout type + struct _TempStorage + { + union Aliasable + { + Uninitialized outer_warp_scan; ///< Buffer for warp-synchronous outer scans + typename InnerWarpScanT::TempStorage inner_warp_scan; ///< Buffer for warp-synchronous inner scan + + } aliasable; + + T warp_aggregates[OUTER_WARPS]; + + T block_aggregate; ///< Shared prefix for the entire thread block + }; + + + /// Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + //--------------------------------------------------------------------- + // Per-thread fields + //--------------------------------------------------------------------- + + // Thread fields + _TempStorage &temp_storage; + unsigned int linear_tid; + unsigned int warp_id; + unsigned int lane_id; + + + //--------------------------------------------------------------------- + // Constructors + //--------------------------------------------------------------------- + + /// Constructor + __device__ __forceinline__ BlockScanWarpScans( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + linear_tid(RowMajorTid(BLOCK_DIM_X, BLOCK_DIM_Y, BLOCK_DIM_Z)), + warp_id((OUTER_WARPS == 1) ? 0 : linear_tid / OUTER_WARP_THREADS), + lane_id((OUTER_WARPS == 1) ? linear_tid : linear_tid % OUTER_WARP_THREADS) + {} + + + //--------------------------------------------------------------------- + // Exclusive scans + //--------------------------------------------------------------------- + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. With no initial value, the output computed for thread0 is undefined. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator + { + // Compute block-wide exclusive scan. The exclusive output from tid0 is invalid. + T block_aggregate; + ExclusiveScan(input, exclusive_output, scan_op, block_aggregate); + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. 
+ template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op) ///< [in] Binary scan operator + { + T block_aggregate; + ExclusiveScan(input, exclusive_output, initial_value, scan_op, block_aggregate); + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. With no initial value, the output computed for thread0 is undefined. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. + T inclusive_output; + OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).Scan( + input, inclusive_output, exclusive_output, scan_op); + + // Share outer warp total + if (lane_id == OUTER_WARP_THREADS - 1) + temp_storage.warp_aggregates[warp_id] = inclusive_output; + + CTA_SYNC(); + + if (linear_tid < INNER_WARP_THREADS) + { + T outer_warp_input = temp_storage.warp_aggregates[linear_tid]; + T outer_warp_exclusive; + + InnerWarpScanT(temp_storage.aliasable.inner_warp_scan).ExclusiveScan( + outer_warp_input, outer_warp_exclusive, scan_op, block_aggregate); + + temp_storage.block_aggregate = block_aggregate; + temp_storage.warp_aggregates[linear_tid] = outer_warp_exclusive; + } + + CTA_SYNC(); + + if (warp_id != 0) + { + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + + // Apply warp prefix to our lane's partial + T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id]; + exclusive_output = scan_op(outer_warp_exclusive, exclusive_output); + if (lane_id == 0) + exclusive_output = outer_warp_exclusive; + } + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input items + T &exclusive_output, ///< [out] Calling thread's output items (may be aliased to \p input) + const T &initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. 
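A scaled-down worked example of the two-level scheme used above (sum operator; real warps have 32 lanes, but take BLOCK_THREADS = 8 and OUTER_WARP_THREADS = 4, i.e. two outer warps, for illustration): inputs are warp 0 = [3, 1, 4, 1] and warp 1 = [5, 9, 2, 6]. The outer warp scans produce the per-warp exclusive partials [-, 3, 4, 8] and [-, 5, 14, 16], and the last lane of each warp writes its warp total (9 and 22) into warp_aggregates. The inner warp scan over [9, 22] yields the warp prefixes [-, 9] and block_aggregate = 31. Warp 1 then applies its prefix 9 to each partial (lane 0 simply takes 9), giving the block-wide exclusive result [-, 3, 4, 8, 9, 14, 23, 25]; the thread-0 slot stays undefined because no initial value was supplied.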
+ T inclusive_output; + OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).Scan( + input, inclusive_output, exclusive_output, scan_op); + + // Share outer warp total + if (lane_id == OUTER_WARP_THREADS - 1) + { + temp_storage.warp_aggregates[warp_id] = inclusive_output; + } + + CTA_SYNC(); + + if (linear_tid < INNER_WARP_THREADS) + { + T outer_warp_input = temp_storage.warp_aggregates[linear_tid]; + T outer_warp_exclusive; + + InnerWarpScanT(temp_storage.aliasable.inner_warp_scan).ExclusiveScan( + outer_warp_input, outer_warp_exclusive, initial_value, scan_op, block_aggregate); + + temp_storage.block_aggregate = block_aggregate; + temp_storage.warp_aggregates[linear_tid] = outer_warp_exclusive; + } + + CTA_SYNC(); + + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + + // Apply warp prefix to our lane's partial + T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id]; + exclusive_output = scan_op(outer_warp_exclusive, exclusive_output); + if (lane_id == 0) + exclusive_output = outer_warp_exclusive; + } + + + /// Computes an exclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. The call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item + T &exclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. + { + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. 
+ T inclusive_output; + OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).Scan( + input, inclusive_output, exclusive_output, scan_op); + + // Share outer warp total + if (lane_id == OUTER_WARP_THREADS - 1) + temp_storage.warp_aggregates[warp_id] = inclusive_output; + + CTA_SYNC(); + + if (linear_tid < INNER_WARP_THREADS) + { + InnerWarpScanT inner_scan(temp_storage.aliasable.inner_warp_scan); + + T upsweep = temp_storage.warp_aggregates[linear_tid]; + T downsweep_prefix, block_aggregate; + + inner_scan.ExclusiveScan(upsweep, downsweep_prefix, scan_op, block_aggregate); + + // Use callback functor to get block prefix in lane0 and then broadcast to other lanes + T block_prefix = block_prefix_callback_op(block_aggregate); + block_prefix = inner_scan.Broadcast(block_prefix, 0); + + downsweep_prefix = scan_op(block_prefix, downsweep_prefix); + if (linear_tid == 0) + downsweep_prefix = block_prefix; + + temp_storage.warp_aggregates[linear_tid] = downsweep_prefix; + } + + CTA_SYNC(); + + // Apply warp prefix to our lane's partial (or assign it if partial is invalid) + T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id]; + exclusive_output = scan_op(outer_warp_exclusive, exclusive_output); + if (lane_id == 0) + exclusive_output = outer_warp_exclusive; + } + + + //--------------------------------------------------------------------- + // Inclusive scans + //--------------------------------------------------------------------- + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator + { + T block_aggregate; + InclusiveScan(input, inclusive_output, scan_op, block_aggregate); + } + + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. Also provides every thread with the block-wide \p block_aggregate of all inputs. + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T &block_aggregate) ///< [out] Threadblock-wide aggregate reduction of input items + { + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. 
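The prefix-callback overload above is what makes multi-tile (streaming) scans work: one thread block scans a long sequence tile by tile, and the callback supplies the running total of the previous tiles as the seed for the current tile. A hedged sketch of such a functor, written against the public cub::BlockScan interface (the names here are illustrative, not part of this file):

// Running-total functor: invoked by the first warp once per tile with that
// tile's aggregate; returns the prefix to apply, then accumulates the total.
struct BlockPrefixCallbackOp
{
    int running_total;

    __device__ BlockPrefixCallbackOp(int initial) : running_total(initial) {}

    __device__ int operator()(int tile_aggregate)
    {
        int old_prefix = running_total;    // prefix for the current tile
        running_total += tile_aggregate;   // carry into the next tile
        return old_prefix;
    }
};

// Inside a kernel, per 128-item tile:
//     BlockScanT(temp_storage).ExclusiveSum(thread_data, thread_data, prefix_op);
//     __syncthreads();   // so temp_storage can be reused for the next tile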
+ OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).InclusiveScan( + input, inclusive_output, scan_op); + + // Share outer warp total + if (lane_id == OUTER_WARP_THREADS - 1) + temp_storage.warp_aggregates[warp_id] = inclusive_output; + + CTA_SYNC(); + + if (linear_tid < INNER_WARP_THREADS) + { + T outer_warp_input = temp_storage.warp_aggregates[linear_tid]; + T outer_warp_exclusive; + + InnerWarpScanT(temp_storage.aliasable.inner_warp_scan).ExclusiveScan( + outer_warp_input, outer_warp_exclusive, scan_op, block_aggregate); + + temp_storage.block_aggregate = block_aggregate; + temp_storage.warp_aggregates[linear_tid] = outer_warp_exclusive; + } + + CTA_SYNC(); + + if (warp_id != 0) + { + // Retrieve block aggregate + block_aggregate = temp_storage.block_aggregate; + + // Apply warp prefix to our lane's partial + T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id]; + inclusive_output = scan_op(outer_warp_exclusive, inclusive_output); + } + } + + + /// Computes an inclusive thread block-wide prefix scan using the specified binary \p scan_op functor. Each thread contributes one input element. the call-back functor \p block_prefix_callback_op is invoked by the first warp in the block, and the value returned by lane0 in that warp is used as the "seed" value that logically prefixes the thread block's scan inputs. + template < + typename ScanOp, + typename BlockPrefixCallbackOp> + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item + T &inclusive_output, ///< [out] Calling thread's output item (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + BlockPrefixCallbackOp &block_prefix_callback_op) ///< [in-out] [warp0 only] Call-back functor for specifying a thread block-wide prefix to be applied to all inputs. + { + // Compute warp scan in each warp. The exclusive output from each lane0 is invalid. + OuterWarpScanT(temp_storage.aliasable.outer_warp_scan.Alias()[warp_id]).InclusiveScan( + input, inclusive_output, scan_op); + + // Share outer warp total + if (lane_id == OUTER_WARP_THREADS - 1) + temp_storage.warp_aggregates[warp_id] = inclusive_output; + + CTA_SYNC(); + + if (linear_tid < INNER_WARP_THREADS) + { + InnerWarpScanT inner_scan(temp_storage.aliasable.inner_warp_scan); + + T upsweep = temp_storage.warp_aggregates[linear_tid]; + T downsweep_prefix, block_aggregate; + inner_scan.ExclusiveScan(upsweep, downsweep_prefix, scan_op, block_aggregate); + + // Use callback functor to get block prefix in lane0 and then broadcast to other lanes + T block_prefix = block_prefix_callback_op(block_aggregate); + block_prefix = inner_scan.Broadcast(block_prefix, 0); + + downsweep_prefix = scan_op(block_prefix, downsweep_prefix); + if (linear_tid == 0) + downsweep_prefix = block_prefix; + + temp_storage.warp_aggregates[linear_tid] = downsweep_prefix; + } + + CTA_SYNC(); + + // Apply warp prefix to our lane's partial + T outer_warp_exclusive = temp_storage.warp_aggregates[warp_id]; + inclusive_output = scan_op(outer_warp_exclusive, inclusive_output); + } + + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/cub.cuh b/GraphBLAS/CUDA/local_cub/cub.cuh new file mode 100644 index 0000000000..3ece0f6584 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/cub.cuh @@ -0,0 +1,95 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. 
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * CUB umbrella include file + */ + +#pragma once + + +// Block +#include "block/block_histogram.cuh" +#include "block/block_discontinuity.cuh" +#include "block/block_exchange.cuh" +#include "block/block_load.cuh" +#include "block/block_radix_rank.cuh" +#include "block/block_radix_sort.cuh" +#include "block/block_reduce.cuh" +#include "block/block_scan.cuh" +#include "block/block_store.cuh" +//#include "block/block_shift.cuh" + +// Device +#include "device/device_histogram.cuh" +#include "device/device_partition.cuh" +#include "device/device_radix_sort.cuh" +#include "device/device_reduce.cuh" +#include "device/device_run_length_encode.cuh" +#include "device/device_scan.cuh" +#include "device/device_segmented_radix_sort.cuh" +#include "device/device_segmented_reduce.cuh" +#include "device/device_select.cuh" +#include "device/device_spmv.cuh" + +// Grid +//#include "grid/grid_barrier.cuh" +#include "grid/grid_even_share.cuh" +#include "grid/grid_mapping.cuh" +#include "grid/grid_queue.cuh" + +// Thread +#include "thread/thread_load.cuh" +#include "thread/thread_operators.cuh" +#include "thread/thread_reduce.cuh" +#include "thread/thread_scan.cuh" +#include "thread/thread_store.cuh" + +// Warp +#include "warp/warp_reduce.cuh" +#include "warp/warp_scan.cuh" + +// Iterator +#include "iterator/arg_index_input_iterator.cuh" +#include "iterator/cache_modified_input_iterator.cuh" +#include "iterator/cache_modified_output_iterator.cuh" +#include "iterator/constant_input_iterator.cuh" +#include "iterator/counting_input_iterator.cuh" +#include "iterator/tex_obj_input_iterator.cuh" +#include "iterator/tex_ref_input_iterator.cuh" +#include "iterator/transform_input_iterator.cuh" + +// Util +#include "util_arch.cuh" +#include "util_debug.cuh" +#include "util_device.cuh" +#include "util_macro.cuh" +#include "util_ptx.cuh" +#include "util_type.cuh" + diff --git 
a/GraphBLAS/CUDA/local_cub/device/device_histogram.cuh b/GraphBLAS/CUDA/local_cub/device/device_histogram.cuh new file mode 100644 index 0000000000..a2556a6b85 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/device/device_histogram.cuh @@ -0,0 +1,866 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory. + */ + +#pragma once + +#include +#include +#include + +#include "dispatch/dispatch_histogram.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory. ![](histogram_logo.png) + * \ingroup SingleModule + * + * \par Overview + * A histogram + * counts the number of observations that fall into each of the disjoint categories (known as bins). + * + * \par Usage Considerations + * \cdp_class{DeviceHistogram} + * + */ +struct DeviceHistogram +{ + /******************************************************************//** + * \name Evenly-segmented bin ranges + *********************************************************************/ + //@{ + + /** + * \brief Computes an intensity histogram from a sequence of data samples using equal-width bins. 
+ * + * \par + * - The number of histogram bins is (\p num_levels - 1) + * - All bins comprise the same width of sample values: (\p upper_level - \p lower_level) / (\p num_levels - 1) + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the computation of a six-bin histogram + * from a sequence of float samples + * + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input samples and + * // output histogram + * int num_samples; // e.g., 10 + * float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, 0.3, 2.9, 2.0, 6.1, 999.5] + * int* d_histogram; // e.g., [ -, -, -, -, -, -, -, -] + * int num_levels; // e.g., 7 (seven level boundaries for six bins) + * float lower_level; // e.g., 0.0 (lower sample value boundary of lowest bin) + * float upper_level; // e.g., 12.0 (upper sample value boundary of upper bin) + * ... + * + * // Determine temporary device storage requirements + * void* d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histograms + * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples); + * + * // d_histogram <-- [1, 0, 5, 0, 3, 0, 0, 0]; + * + * \endcode + * + * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator + * \tparam CounterT [inferred] Integer type for histogram bin counters + * \tparam LevelT [inferred] Type for specifying boundaries (levels) + * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 + */ + template < + typename SampleIteratorT, + typename CounterT, + typename LevelT, + typename OffsetT> + CUB_RUNTIME_FUNCTION + static cudaError_t HistogramEven( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of data samples. + CounterT* d_histogram, ///< [out] The pointer to the histogram counter output array of length num_levels - 1. + int num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is num_levels - 1. + LevelT lower_level, ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin. + LevelT upper_level, ///< [in] The upper sample value bound (exclusive) for the highest histogram bin. + OffsetT num_samples, ///< [in] The number of input samples (i.e., the length of \p d_samples) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
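A self-contained host-side sketch of the same two-phase pattern (size query, then run); the include path is an assumption, error checking is omitted for brevity, and the file must be compiled with nvcc:

#include <cstdio>
#include <cuda_runtime.h>
#include "local_cub/cub.cuh"       // assumed path to the vendored CUB headers

int main()
{
    // Six equal-width bins over [0, 12); 999.5 falls outside and is not counted.
    const int   num_samples = 10;
    const float h_samples[num_samples] =
        {2.2f, 6.0f, 7.1f, 2.9f, 3.5f, 0.3f, 2.9f, 2.0f, 6.1f, 999.5f};
    const int   num_levels  = 7;
    const float lower_level = 0.0f, upper_level = 12.0f;

    float *d_samples   = NULL;
    int   *d_histogram = NULL;
    cudaMalloc(&d_samples, sizeof(h_samples));
    cudaMalloc(&d_histogram, (num_levels - 1) * sizeof(int));
    cudaMemcpy(d_samples, h_samples, sizeof(h_samples), cudaMemcpyHostToDevice);

    // Phase 1: with d_temp_storage == NULL, only the required size is computed.
    void  *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
        d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples);

    // Phase 2: allocate the scratch space and compute the histogram.
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
        d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples);

    int h_histogram[num_levels - 1];
    cudaMemcpy(h_histogram, d_histogram, sizeof(h_histogram), cudaMemcpyDeviceToHost);
    for (int i = 0; i < num_levels - 1; ++i)
        printf("bin %d: %d\n", i, h_histogram[i]);   // bins of width 2 over [0, 12)

    cudaFree(d_samples); cudaFree(d_histogram); cudaFree(d_temp_storage);
    return 0;
}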
+ { + /// The sample value type of the input iterator + typedef typename std::iterator_traits::value_type SampleT; + + CounterT* d_histogram1[1] = {d_histogram}; + int num_levels1[1] = {num_levels}; + LevelT lower_level1[1] = {lower_level}; + LevelT upper_level1[1] = {upper_level}; + + return MultiHistogramEven<1, 1>( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_histogram1, + num_levels1, + lower_level1, + upper_level1, + num_samples, + 1, + sizeof(SampleT) * num_samples, + stream, + debug_synchronous); + } + + + /** + * \brief Computes an intensity histogram from a sequence of data samples using equal-width bins. + * + * \par + * - A two-dimensional region of interest within \p d_samples can be specified + * using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters. + * - The row stride must be a whole multiple of the sample data type + * size, i.e., (row_stride_bytes % sizeof(SampleT)) == 0. + * - The number of histogram bins is (\p num_levels - 1) + * - All bins comprise the same width of sample values: (\p upper_level - \p lower_level) / (\p num_levels - 1) + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the computation of a six-bin histogram + * from a 2x5 region of interest within a flattened 2x7 array of float samples. + * + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input samples and + * // output histogram + * int num_row_samples; // e.g., 5 + * int num_rows; // e.g., 2; + * size_t row_stride_bytes; // e.g., 7 * sizeof(float) + * float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, -, -, + * // 0.3, 2.9, 2.0, 6.1, 999.5, -, -] + * int* d_histogram; // e.g., [ -, -, -, -, -, -, -, -] + * int num_levels; // e.g., 7 (seven level boundaries for six bins) + * float lower_level; // e.g., 0.0 (lower sample value boundary of lowest bin) + * float upper_level; // e.g., 12.0 (upper sample value boundary of upper bin) + * ... + * + * // Determine temporary device storage requirements + * void* d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, lower_level, upper_level, + * num_row_samples, num_rows, row_stride_bytes); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histograms + * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes, d_samples, d_histogram, + * d_samples, d_histogram, num_levels, lower_level, upper_level, + * num_row_samples, num_rows, row_stride_bytes); + * + * // d_histogram <-- [1, 0, 5, 0, 3, 0, 0, 0]; + * + * \endcode + * + * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator + * \tparam CounterT [inferred] Integer type for histogram bin counters + * \tparam LevelT [inferred] Type for specifying boundaries (levels) + * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 + */ + template < + typename SampleIteratorT, + typename CounterT, + typename LevelT, + typename OffsetT> + CUB_RUNTIME_FUNCTION + static cudaError_t HistogramEven( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of data samples. + CounterT* d_histogram, ///< [out] The pointer to the histogram counter output array of length num_levels - 1. + int num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is num_levels - 1. + LevelT lower_level, ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin. + LevelT upper_level, ///< [in] The upper sample value bound (exclusive) for the highest histogram bin. + OffsetT num_row_samples, ///< [in] The number of data samples per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + size_t row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + CounterT* d_histogram1[1] = {d_histogram}; + int num_levels1[1] = {num_levels}; + LevelT lower_level1[1] = {lower_level}; + LevelT upper_level1[1] = {upper_level}; + + return MultiHistogramEven<1, 1>( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_histogram1, + num_levels1, + lower_level1, + upper_level1, + num_row_samples, + num_rows, + row_stride_bytes, + stream, + debug_synchronous); + } + + /** + * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using equal-width bins. + * + * \par + * - The input is a sequence of pixel structures, where each pixel comprises + * a record of \p NUM_CHANNELS consecutive data samples (e.g., an RGBA pixel). + * - Of the \p NUM_CHANNELS specified, the function will only compute histograms + * for the first \p NUM_ACTIVE_CHANNELS (e.g., only RGB histograms from RGBA + * pixel samples). + * - The number of histogram bins for channeli is num_levels[i] - 1. + * - For channeli, the range of values for all histogram bins + * have the same width: (upper_level[i] - lower_level[i]) / ( num_levels[i] - 1) + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the computation of three 256-bin RGB histograms + * from a quad-channel sequence of RGBA pixels (8 bits per channel per pixel) + * + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input samples + * // and output histograms + * int num_pixels; // e.g., 5 + * unsigned char* d_samples; // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2), + * // (0, 6, 7, 5), (3, 0, 2, 6)] + * int* d_histogram[3]; // e.g., three device pointers to three device buffers, + * // each allocated with 256 integer counters + * int num_levels[3]; // e.g., {257, 257, 257}; + * unsigned int lower_level[3]; // e.g., {0, 0, 0}; + * unsigned int upper_level[3]; // e.g., {256, 256, 256}; + * ... 
+ * + * // Determine temporary device storage requirements + * void* d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, lower_level, upper_level, num_pixels); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histograms + * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, lower_level, upper_level, num_pixels); + * + * // d_histogram <-- [ [1, 0, 1, 2, 0, 0, 0, 1, 0, 0, 0, ..., 0], + * // [0, 3, 0, 0, 0, 0, 2, 0, 0, 0, 0, ..., 0], + * // [0, 0, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ] + * + * \endcode + * + * \tparam NUM_CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + * \tparam NUM_ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed + * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator + * \tparam CounterT [inferred] Integer type for histogram bin counters + * \tparam LevelT [inferred] Type for specifying boundaries (levels) + * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 + */ + template < + int NUM_CHANNELS, + int NUM_ACTIVE_CHANNELS, + typename SampleIteratorT, + typename CounterT, + typename LevelT, + typename OffsetT> + CUB_RUNTIME_FUNCTION + static cudaError_t MultiHistogramEven( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histogram[i] should be num_levels[i] - 1. + int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. + LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. + LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. + OffsetT num_pixels, ///< [in] The number of multi-channel pixels (i.e., the length of \p d_samples / NUM_CHANNELS) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
+ { + /// The sample value type of the input iterator + typedef typename std::iterator_traits::value_type SampleT; + + return MultiHistogramEven( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_histogram, + num_levels, + lower_level, + upper_level, + num_pixels, + 1, + sizeof(SampleT) * NUM_CHANNELS * num_pixels, + stream, + debug_synchronous); + } + + + /** + * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using equal-width bins. + * + * \par + * - The input is a sequence of pixel structures, where each pixel comprises + * a record of \p NUM_CHANNELS consecutive data samples (e.g., an RGBA pixel). + * - Of the \p NUM_CHANNELS specified, the function will only compute histograms + * for the first \p NUM_ACTIVE_CHANNELS (e.g., only RGB histograms from RGBA + * pixel samples). + * - A two-dimensional region of interest within \p d_samples can be specified + * using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters. + * - The row stride must be a whole multiple of the sample data type + * size, i.e., (row_stride_bytes % sizeof(SampleT)) == 0. + * - The number of histogram bins for channeli is num_levels[i] - 1. + * - For channeli, the range of values for all histogram bins + * have the same width: (upper_level[i] - lower_level[i]) / ( num_levels[i] - 1) + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the computation of three 256-bin RGB histograms from a 2x3 region of + * interest of within a flattened 2x4 array of quad-channel RGBA pixels (8 bits per channel per pixel). + * + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input samples + * // and output histograms + * int num_row_pixels; // e.g., 3 + * int num_rows; // e.g., 2 + * size_t row_stride_bytes; // e.g., 4 * sizeof(unsigned char) * NUM_CHANNELS + * unsigned char* d_samples; // e.g., [(2, 6, 7, 5), (3, 0, 2, 1), (7, 0, 6, 2), (-, -, -, -), + * // (0, 6, 7, 5), (3, 0, 2, 6), (1, 1, 1, 1), (-, -, -, -)] + * int* d_histogram[3]; // e.g., three device pointers to three device buffers, + * // each allocated with 256 integer counters + * int num_levels[3]; // e.g., {257, 257, 257}; + * unsigned int lower_level[3]; // e.g., {0, 0, 0}; + * unsigned int upper_level[3]; // e.g., {256, 256, 256}; + * ... 
+ * + * // Determine temporary device storage requirements + * void* d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, lower_level, upper_level, + * num_row_pixels, num_rows, row_stride_bytes); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histograms + * cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, lower_level, upper_level, + * num_row_pixels, num_rows, row_stride_bytes); + * + * // d_histogram <-- [ [1, 1, 1, 2, 0, 0, 0, 1, 0, 0, 0, ..., 0], + * // [0, 4, 0, 0, 0, 0, 2, 0, 0, 0, 0, ..., 0], + * // [0, 1, 2, 0, 0, 0, 1, 2, 0, 0, 0, ..., 0] ] + * + * \endcode + * + * \tparam NUM_CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + * \tparam NUM_ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed + * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator + * \tparam CounterT [inferred] Integer type for histogram bin counters + * \tparam LevelT [inferred] Type for specifying boundaries (levels) + * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 + */ + template < + int NUM_CHANNELS, + int NUM_ACTIVE_CHANNELS, + typename SampleIteratorT, + typename CounterT, + typename LevelT, + typename OffsetT> + CUB_RUNTIME_FUNCTION + static cudaError_t MultiHistogramEven( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histogram[i] should be num_levels[i] - 1. + int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. + LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. + LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. + OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + size_t row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
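The row_stride_bytes parameter of this overload is simply the byte pitch between consecutive rows, so a pitched allocation can be passed straight through. A brief sketch under that assumption (function and variable names are illustrative, and the histogram arrays are assumed to be pre-allocated device buffers):

#include <cuda_runtime.h>
#include "local_cub/cub.cuh"       // assumed path to the vendored CUB headers

// Three histograms over the RGB channels of pitched RGBA8 image rows.
void HistogramOfPitchedImage(int num_row_pixels, int num_rows,
                             int *d_histogram[3], int num_levels[3],
                             unsigned int lower_level[3], unsigned int upper_level[3])
{
    // cudaMallocPitch pads each row; the returned pitch is exactly the
    // row_stride_bytes this interface expects (and trivially a multiple
    // of sizeof(unsigned char)).
    unsigned char *d_pixels = NULL;
    size_t pitch_bytes = 0;
    cudaMallocPitch(&d_pixels, &pitch_bytes,
                    num_row_pixels * 4 * sizeof(unsigned char), num_rows);
    // ... fill d_pixels with RGBA samples ...

    void  *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes,
        d_pixels, d_histogram, num_levels, lower_level, upper_level,
        num_row_pixels, num_rows, pitch_bytes);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceHistogram::MultiHistogramEven<4, 3>(d_temp_storage, temp_storage_bytes,
        d_pixels, d_histogram, num_levels, lower_level, upper_level,
        num_row_pixels, num_rows, pitch_bytes);

    cudaFree(d_pixels);
    cudaFree(d_temp_storage);
}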
+ { + /// The sample value type of the input iterator + typedef typename std::iterator_traits::value_type SampleT; + Int2Type is_byte_sample; + + if ((sizeof(OffsetT) > sizeof(int)) && + ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) std::numeric_limits::max())) + { + // Down-convert OffsetT data type + + + return DipatchHistogram::DispatchEven( + d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level, + (int) num_row_pixels, (int) num_rows, (int) (row_stride_bytes / sizeof(SampleT)), + stream, debug_synchronous, is_byte_sample); + } + + return DipatchHistogram::DispatchEven( + d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, lower_level, upper_level, + num_row_pixels, num_rows, (OffsetT) (row_stride_bytes / sizeof(SampleT)), + stream, debug_synchronous, is_byte_sample); + } + + + //@} end member group + /******************************************************************//** + * \name Custom bin ranges + *********************************************************************/ + //@{ + + /** + * \brief Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels. + * + * \par + * - The number of histogram bins is (\p num_levels - 1) + * - The value range for bini is [level[i], level[i+1]) + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the computation of an six-bin histogram + * from a sequence of float samples + * + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input samples and + * // output histogram + * int num_samples; // e.g., 10 + * float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, 0.3, 2.9, 2.0, 6.1, 999.5] + * int* d_histogram; // e.g., [ -, -, -, -, -, -, -, -] + * int num_levels // e.g., 7 (seven level boundaries for six bins) + * float* d_levels; // e.g., [0.0, 2.0, 4.0, 6.0, 8.0, 12.0, 16.0] + * ... + * + * // Determine temporary device storage requirements + * void* d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, d_levels, num_samples); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histograms + * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, d_levels, num_samples); + * + * // d_histogram <-- [1, 0, 5, 0, 3, 0, 0, 0]; + * + * \endcode + * + * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator + * \tparam CounterT [inferred] Integer type for histogram bin counters + * \tparam LevelT [inferred] Type for specifying boundaries (levels) + * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 + */ + template < + typename SampleIteratorT, + typename CounterT, + typename LevelT, + typename OffsetT> + CUB_RUNTIME_FUNCTION + static cudaError_t HistogramRange( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of data samples. 
+ CounterT* d_histogram, ///< [out] The pointer to the histogram counter output array of length num_levels - 1. + int num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is num_levels - 1. + LevelT* d_levels, ///< [in] The pointer to the array of boundaries (levels). Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. + OffsetT num_samples, ///< [in] The number of data samples per row in the region of interest + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + /// The sample value type of the input iterator + typedef typename std::iterator_traits::value_type SampleT; + + CounterT* d_histogram1[1] = {d_histogram}; + int num_levels1[1] = {num_levels}; + LevelT* d_levels1[1] = {d_levels}; + + return MultiHistogramRange<1, 1>( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_histogram1, + num_levels1, + d_levels1, + num_samples, + 1, + sizeof(SampleT) * num_samples, + stream, + debug_synchronous); + } + + + /** + * \brief Computes an intensity histogram from a sequence of data samples using the specified bin boundary levels. + * + * \par + * - A two-dimensional region of interest within \p d_samples can be specified + * using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters. + * - The row stride must be a whole multiple of the sample data type + * size, i.e., (row_stride_bytes % sizeof(SampleT)) == 0. + * - The number of histogram bins is (\p num_levels - 1) + * - The value range for bini is [level[i], level[i+1]) + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the computation of a six-bin histogram + * from a 2x5 region of interest within a flattened 2x7 array of float samples. + * + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input samples and + * // output histogram + * int num_row_samples; // e.g., 5 + * int num_rows; // e.g., 2; + * int row_stride_bytes; // e.g., 7 * sizeof(float) + * float* d_samples; // e.g., [2.2, 6.0, 7.1, 2.9, 3.5, -, -, + * // 0.3, 2.9, 2.0, 6.1, 999.5, -, -] + * int* d_histogram; // e.g., [ , , , , , , , ] + * int num_levels // e.g., 7 (seven level boundaries for six bins) + * float *d_levels; // e.g., [0.0, 2.0, 4.0, 6.0, 8.0, 12.0, 16.0] + * ... + * + * // Determine temporary device storage requirements + * void* d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, d_levels, + * num_row_samples, num_rows, row_stride_bytes); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histograms + * cub::DeviceHistogram::HistogramRange(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, d_levels, + * num_row_samples, num_rows, row_stride_bytes); + * + * // d_histogram <-- [1, 0, 5, 0, 3, 0, 0, 0]; + * + * \endcode + * + * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. 
\iterator + * \tparam CounterT [inferred] Integer type for histogram bin counters + * \tparam LevelT [inferred] Type for specifying boundaries (levels) + * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 + */ + template < + typename SampleIteratorT, + typename CounterT, + typename LevelT, + typename OffsetT> + CUB_RUNTIME_FUNCTION + static cudaError_t HistogramRange( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of data samples. + CounterT* d_histogram, ///< [out] The pointer to the histogram counter output array of length num_levels - 1. + int num_levels, ///< [in] The number of boundaries (levels) for delineating histogram samples. Implies that the number of bins is num_levels - 1. + LevelT* d_levels, ///< [in] The pointer to the array of boundaries (levels). Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. + OffsetT num_row_samples, ///< [in] The number of data samples per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + size_t row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + CounterT* d_histogram1[1] = {d_histogram}; + int num_levels1[1] = {num_levels}; + LevelT* d_levels1[1] = {d_levels}; + + return MultiHistogramRange<1, 1>( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_histogram1, + num_levels1, + d_levels1, + num_row_samples, + num_rows, + row_stride_bytes, + stream, + debug_synchronous); + } + + /** + * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using the specified bin boundary levels. + * + * \par + * - The input is a sequence of pixel structures, where each pixel comprises + * a record of \p NUM_CHANNELS consecutive data samples (e.g., an RGBA pixel). + * - Of the \p NUM_CHANNELS specified, the function will only compute histograms + * for the first \p NUM_ACTIVE_CHANNELS (e.g., RGB histograms from RGBA + * pixel samples). + * - The number of histogram bins for channeli is num_levels[i] - 1. 
+ * - For channeli, the range of values for all histogram bins + * have the same width: (upper_level[i] - lower_level[i]) / ( num_levels[i] - 1) + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the computation of three 4-bin RGB histograms + * from a quad-channel sequence of RGBA pixels (8 bits per channel per pixel) + * + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input samples + * // and output histograms + * int num_pixels; // e.g., 5 + * unsigned char *d_samples; // e.g., [(2, 6, 7, 5),(3, 0, 2, 1),(7, 0, 6, 2), + * // (0, 6, 7, 5),(3, 0, 2, 6)] + * unsigned int *d_histogram[3]; // e.g., [[ -, -, -, -],[ -, -, -, -],[ -, -, -, -]]; + * int num_levels[3]; // e.g., {5, 5, 5}; + * unsigned int *d_levels[3]; // e.g., [ [0, 2, 4, 6, 8], + * // [0, 2, 4, 6, 8], + * // [0, 2, 4, 6, 8] ]; + * ... + * + * // Determine temporary device storage requirements + * void* d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, d_levels, num_pixels); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histograms + * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, d_levels, num_pixels); + * + * // d_histogram <-- [ [1, 3, 0, 1], + * // [3, 0, 0, 2], + * // [0, 2, 0, 3] ] + * + * \endcode + * + * \tparam NUM_CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + * \tparam NUM_ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed + * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator + * \tparam CounterT [inferred] Integer type for histogram bin counters + * \tparam LevelT [inferred] Type for specifying boundaries (levels) + * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 + */ + template < + int NUM_CHANNELS, + int NUM_ACTIVE_CHANNELS, + typename SampleIteratorT, + typename CounterT, + typename LevelT, + typename OffsetT> + CUB_RUNTIME_FUNCTION + static cudaError_t MultiHistogramRange( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histogram[i] should be num_levels[i] - 1. + int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. + LevelT* d_levels[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. 
Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. + OffsetT num_pixels, ///< [in] The number of multi-channel pixels (i.e., the length of \p d_samples / NUM_CHANNELS) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + /// The sample value type of the input iterator + typedef typename std::iterator_traits::value_type SampleT; + + return MultiHistogramRange( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_histogram, + num_levels, + d_levels, + num_pixels, + 1, + sizeof(SampleT) * NUM_CHANNELS * num_pixels, + stream, + debug_synchronous); + } + + + /** + * \brief Computes per-channel intensity histograms from a sequence of multi-channel "pixel" data samples using the specified bin boundary levels. + * + * \par + * - The input is a sequence of pixel structures, where each pixel comprises + * a record of \p NUM_CHANNELS consecutive data samples (e.g., an RGBA pixel). + * - Of the \p NUM_CHANNELS specified, the function will only compute histograms + * for the first \p NUM_ACTIVE_CHANNELS (e.g., RGB histograms from RGBA + * pixel samples). + * - A two-dimensional region of interest within \p d_samples can be specified + * using the \p num_row_samples, num_rows, and \p row_stride_bytes parameters. + * - The row stride must be a whole multiple of the sample data type + * size, i.e., (row_stride_bytes % sizeof(SampleT)) == 0. + * - The number of histogram bins for channeli is num_levels[i] - 1. + * - For channeli, the range of values for all histogram bins + * have the same width: (upper_level[i] - lower_level[i]) / ( num_levels[i] - 1) + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the computation of three 4-bin RGB histograms from a 2x3 region of + * interest of within a flattened 2x4 array of quad-channel RGBA pixels (8 bits per channel per pixel). + * + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input samples + * // and output histograms + * int num_row_pixels; // e.g., 3 + * int num_rows; // e.g., 2 + * size_t row_stride_bytes; // e.g., 4 * sizeof(unsigned char) * NUM_CHANNELS + * unsigned char* d_samples; // e.g., [(2, 6, 7, 5),(3, 0, 2, 1),(1, 1, 1, 1),(-, -, -, -), + * // (7, 0, 6, 2),(0, 6, 7, 5),(3, 0, 2, 6),(-, -, -, -)] + * int* d_histogram[3]; // e.g., [[ -, -, -, -],[ -, -, -, -],[ -, -, -, -]]; + * int num_levels[3]; // e.g., {5, 5, 5}; + * unsigned int* d_levels[3]; // e.g., [ [0, 2, 4, 6, 8], + * // [0, 2, 4, 6, 8], + * // [0, 2, 4, 6, 8] ]; + * ... 
+ * + * // Determine temporary device storage requirements + * void* d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, d_levels, num_row_pixels, num_rows, row_stride_bytes); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Compute histograms + * cub::DeviceHistogram::MultiHistogramRange<4, 3>(d_temp_storage, temp_storage_bytes, + * d_samples, d_histogram, num_levels, d_levels, num_row_pixels, num_rows, row_stride_bytes); + * + * // d_histogram <-- [ [2, 3, 0, 1], + * // [3, 0, 0, 2], + * // [1, 2, 0, 3] ] + * + * \endcode + * + * \tparam NUM_CHANNELS Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + * \tparam NUM_ACTIVE_CHANNELS [inferred] Number of channels actively being histogrammed + * \tparam SampleIteratorT [inferred] Random-access input iterator type for reading input samples. \iterator + * \tparam CounterT [inferred] Integer type for histogram bin counters + * \tparam LevelT [inferred] Type for specifying boundaries (levels) + * \tparam OffsetT [inferred] Signed integer type for sequence offsets, list lengths, pointer differences, etc. \offset_size1 + */ + template < + int NUM_CHANNELS, + int NUM_ACTIVE_CHANNELS, + typename SampleIteratorT, + typename CounterT, + typename LevelT, + typename OffsetT> + CUB_RUNTIME_FUNCTION + static cudaError_t MultiHistogramRange( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_histogram[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histogram[i] should be num_levels[i] - 1. + int num_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_levels[i] - 1. + LevelT* d_levels[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. + OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + size_t row_stride_bytes, ///< [in] The number of bytes between starts of consecutive rows in the region of interest + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
+    {
+        /// The sample value type of the input iterator
+        typedef typename std::iterator_traits<SampleIteratorT>::value_type SampleT;
+        Int2Type<sizeof(SampleT) == 1> is_byte_sample;
+
+        if ((sizeof(OffsetT) > sizeof(int)) &&
+            ((unsigned long long) (num_rows * row_stride_bytes) < (unsigned long long) std::numeric_limits<int>::max()))
+        {
+            // Down-convert OffsetT data type
+            return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, int>::DispatchRange(
+                d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels,
+                (int) num_row_pixels, (int) num_rows, (int) (row_stride_bytes / sizeof(SampleT)),
+                stream, debug_synchronous, is_byte_sample);
+        }
+
+        return DipatchHistogram<NUM_CHANNELS, NUM_ACTIVE_CHANNELS, SampleIteratorT, CounterT, LevelT, OffsetT>::DispatchRange(
+            d_temp_storage, temp_storage_bytes, d_samples, d_histogram, num_levels, d_levels,
+            num_row_pixels, num_rows, (OffsetT) (row_stride_bytes / sizeof(SampleT)),
+            stream, debug_synchronous, is_byte_sample);
+    }
+
+
+
+    //@}  end member group
+};
+
+}               // CUB namespace
+CUB_NS_POSTFIX  // Optional outer namespace(s)
+
+
diff --git a/GraphBLAS/CUDA/local_cub/device/device_partition.cuh b/GraphBLAS/CUDA/local_cub/device/device_partition.cuh
new file mode 100644
index 0000000000..5053540071
--- /dev/null
+++ b/GraphBLAS/CUDA/local_cub/device/device_partition.cuh
@@ -0,0 +1,273 @@
+
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/**
+ * \file
+ * cub::DevicePartition provides device-wide, parallel operations for partitioning sequences of data items residing within device-accessible memory.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include <iterator>
+
+#include "dispatch/dispatch_select_if.cuh"
+#include "../util_namespace.cuh"
+
+/// Optional outer namespace(s)
+CUB_NS_PREFIX
+
+/// CUB namespace
+namespace cub {
+
+
+/**
+ * \brief DevicePartition provides device-wide, parallel operations for partitioning sequences of data items residing within device-accessible memory.
![](partition_logo.png) + * \ingroup SingleModule + * + * \par Overview + * These operations apply a selection criterion to construct a partitioned output sequence from items selected/unselected from + * a specified input sequence. + * + * \par Usage Considerations + * \cdp_class{DevicePartition} + * + * \par Performance + * \linear_performance{partition} + * + * \par + * The following chart illustrates DevicePartition::If + * performance across different CUDA architectures for \p int32 items, + * where 50% of the items are randomly selected for the first partition. + * \plots_below + * + * \image html partition_if_int32_50_percent.png + * + */ +struct DevicePartition +{ + /** + * \brief Uses the \p d_flags sequence to split the corresponding items from \p d_in into a partitioned sequence \p d_out. The total number of items copied into the first partition is written to \p d_num_selected_out. ![](partition_flags_logo.png) + * + * \par + * - The value type of \p d_flags must be castable to \p bool (e.g., \p bool, \p char, \p int, etc.). + * - Copies of the selected items are compacted into \p d_out and maintain their original + * relative ordering, however copies of the unselected items are compacted into the + * rear of \p d_out in reverse order. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the compaction of items selected from an \p int device vector. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input, flags, and output + * int num_items; // e.g., 8 + * int *d_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] + * char *d_flags; // e.g., [1, 0, 0, 1, 0, 1, 1, 0] + * int *d_out; // e.g., [ , , , , , , , ] + * int *d_num_selected_out; // e.g., [ ] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run selection + * cub::DevicePartition::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items); + * + * // d_out <-- [1, 4, 6, 7, 8, 5, 3, 2] + * // d_num_selected_out <-- [4] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam FlagIterator [inferred] Random-access input iterator type for reading selection flags \iterator + * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing output items \iterator + * \tparam NumSelectedIteratorT [inferred] Output iterator type for recording the number of items selected \iterator + */ + template < + typename InputIteratorT, + typename FlagIterator, + typename OutputIteratorT, + typename NumSelectedIteratorT> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Flagged( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + FlagIterator d_flags, ///< [in] Pointer to the input sequence of selection flags + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of partitioned data items + NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition) + int num_items, ///< [in] Total number of items to select from + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + typedef int OffsetT; // Signed integer type for global offsets + typedef NullType SelectOp; // Selection op (not used) + typedef NullType EqualityOp; // Equality operator (not used) + + return DispatchSelectIf::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_flags, + d_out, + d_num_selected_out, + SelectOp(), + EqualityOp(), + num_items, + stream, + debug_synchronous); + } + + + /** + * \brief Uses the \p select_op functor to split the corresponding items from \p d_in into a partitioned sequence \p d_out. The total number of items copied into the first partition is written to \p d_num_selected_out. ![](partition_logo.png) + * + * \par + * - Copies of the selected items are compacted into \p d_out and maintain their original + * relative ordering, however copies of the unselected items are compacted into the + * rear of \p d_out in reverse order. + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated partition-if performance across different + * CUDA architectures for \p int32 and \p int64 items, respectively. Items are + * selected for the first partition with 50% probability. + * + * \image html partition_if_int32_50_percent.png + * \image html partition_if_int64_50_percent.png + * + * \par + * The following charts are similar, but 5% selection probability for the first partition: + * + * \image html partition_if_int32_5_percent.png + * \image html partition_if_int64_5_percent.png + * + * \par Snippet + * The code snippet below illustrates the compaction of items selected from an \p int device vector. + * \par + * \code + * #include // or equivalently + * + * // Functor type for selecting values less than some criteria + * struct LessThan + * { + * int compare; + * + * CUB_RUNTIME_FUNCTION __forceinline__ + * LessThan(int compare) : compare(compare) {} + * + * CUB_RUNTIME_FUNCTION __forceinline__ + * bool operator()(const int &a) const { + * return (a < compare); + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 8 + * int *d_in; // e.g., [0, 2, 3, 9, 5, 2, 81, 8] + * int *d_out; // e.g., [ , , , , , , , ] + * int *d_num_selected_out; // e.g., [ ] + * LessThan select_op(7); + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run selection + * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op); + * + * // d_out <-- [0, 2, 3, 5, 2, 8, 81, 9] + * // d_num_selected_out <-- [5] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing output items \iterator + * \tparam NumSelectedIteratorT [inferred] Output iterator type for recording the number of items selected \iterator + * \tparam SelectOp [inferred] Selection functor type having member bool operator()(const T &a) + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename NumSelectedIteratorT, + typename SelectOp> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t If( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of partitioned data items + NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., the offset of the unselected partition) + int num_items, ///< [in] Total number of items to select from + SelectOp select_op, ///< [in] Unary selection operator + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + typedef int OffsetT; // Signed integer type for global offsets + typedef NullType* FlagIterator; // FlagT iterator type (not used) + typedef NullType EqualityOp; // Equality operator (not used) + + return DispatchSelectIf::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + NULL, + d_out, + d_num_selected_out, + select_op, + EqualityOp(), + num_items, + stream, + debug_synchronous); + } + +}; + +/** + * \example example_device_partition_flagged.cu + * \example example_device_partition_if.cu + */ + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/GraphBLAS/CUDA/local_cub/device/device_radix_sort.cuh b/GraphBLAS/CUDA/local_cub/device/device_radix_sort.cuh new file mode 100644 index 0000000000..1c0bdbea1d --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/device/device_radix_sort.cuh @@ -0,0 +1,797 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "dispatch/dispatch_radix_sort.cuh" +#include "../util_arch.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory. ![](sorting_logo.png) + * \ingroup SingleModule + * + * \par Overview + * The [radix sorting method](http://en.wikipedia.org/wiki/Radix_sort) arranges + * items into ascending (or descending) order. The algorithm relies upon a positional representation for + * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits, + * characters, etc.) specified from least-significant to most-significant. For a + * given input sequence of keys and a set of rules specifying a total ordering + * of the symbolic alphabet, the radix sorting method produces a lexicographic + * ordering of those keys. + * + * \par + * DeviceRadixSort can sort all of the built-in C++ numeric primitive types + * (unsigned char, \p int, \p double, etc.) as well as CUDA's \p __half + * half-precision floating-point type. Although the direct radix sorting + * method can only be applied to unsigned integral types, DeviceRadixSort + * is able to sort signed and floating-point types via simple bit-wise transformations + * that ensure lexicographic key ordering. + * + * \par Usage Considerations + * \cdp_class{DeviceRadixSort} + * + * \par Performance + * \linear_performance{radix sort} The following chart illustrates DeviceRadixSort::SortKeys + * performance across different CUDA architectures for uniform-random \p uint32 keys. 
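The overview above points out that, although radix sorting natively orders unsigned integer keys, signed and floating-point keys are handled through order-preserving bit-wise transformations. The sketch below exercises that path with a SortKeys call on float keys; the data and include path are illustrative assumptions, and error checking and cudaFree are omitted.

    // Sketch: sorting float keys with DeviceRadixSort::SortKeys.
    #include <cuda_runtime.h>
    #include <cub/cub.cuh>   // adjust the include path for the vendored local_cub copy

    int main()
    {
        const int num_items = 7;
        float h_keys[num_items] = { 8.5f, -6.0f, 7.25f, 5.0f, -3.5f, 0.0f, 9.0f };

        float *d_keys_in, *d_keys_out;
        cudaMalloc(&d_keys_in,  num_items * sizeof(float));
        cudaMalloc(&d_keys_out, num_items * sizeof(float));
        cudaMemcpy(d_keys_in, h_keys, num_items * sizeof(float), cudaMemcpyHostToDevice);

        // First call sizes the temporary storage, second call sorts
        void  *d_temp_storage = NULL;
        size_t temp_storage_bytes = 0;
        cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes,
                                       d_keys_in, d_keys_out, num_items);
        cudaMalloc(&d_temp_storage, temp_storage_bytes);
        cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes,
                                       d_keys_in, d_keys_out, num_items);
        cudaDeviceSynchronize();

        // d_keys_out now holds [-6.0, -3.5, 0.0, 5.0, 7.25, 8.5, 9.0]
        return 0;
    }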
+ * \plots_below + * + * \image html lsb_radix_sort_int32_keys.png + * + */ +struct DeviceRadixSort +{ + + /******************************************************************//** + * \name KeyT-value pairs + *********************************************************************/ + //@{ + + /** + * \brief Sorts key-value pairs into ascending order. (~2N auxiliary storage required) + * + * \par + * - The contents of the input data are not altered by the sorting operation + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated sorting performance across different + * CUDA architectures for uniform-random uint32,uint32 and + * uint64,uint64 pairs, respectively. + * + * \image html lsb_radix_sort_int32_pairs.png + * \image html lsb_radix_sort_int64_pairs.png + * + * \par Snippet + * The code snippet below illustrates the sorting of a device vector of \p int keys + * with associated vector of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_keys_out; // e.g., [ ... ] + * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] + * int *d_values_out; // e.g., [ ... ] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, + * d_keys_in, d_keys_out, d_values_in, d_values_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, + * d_keys_in, d_keys_out, d_values_in, d_values_out, num_items); + * + * // d_keys_out <-- [0, 3, 5, 6, 7, 8, 9] + * // d_values_out <-- [5, 4, 3, 1, 2, 0, 6] + * + * \endcode + * + * \tparam KeyT [inferred] KeyT type + * \tparam ValueT [inferred] ValueT type + */ + template < + typename KeyT, + typename ValueT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortPairs( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + const KeyT *d_keys_in, ///< [in] Pointer to the input data of key data to sort + KeyT *d_keys_out, ///< [out] Pointer to the sorted output sequence of key data + const ValueT *d_values_in, ///< [in] Pointer to the corresponding input sequence of associated value items + ValueT *d_values_out, ///< [out] Pointer to the correspondingly-reordered output sequence of associated value items + int num_items, ///< [in] Number of items to sort + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. 
Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + DoubleBuffer d_values(const_cast(d_values_in), d_values_out); + + return DispatchRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + false, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts key-value pairs into ascending order. (~N auxiliary storage required) + * + * \par + * - The sorting operation is given a pair of key buffers and a corresponding + * pair of associated value buffers. Each pair is managed by a DoubleBuffer + * structure that indicates which of the two buffers is "current" (and thus + * contains the input data to be sorted). + * - The contents of both buffers within each pair may be altered by the sorting + * operation. + * - Upon completion, the sorting operation will update the "current" indicator + * within each DoubleBuffer wrapper to reference which of the two buffers + * now contains the sorted output sequence (a function of the number of key bits + * specified and the targeted device architecture). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageP + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated sorting performance across different + * CUDA architectures for uniform-random uint32,uint32 and + * uint64,uint64 pairs, respectively. + * + * \image html lsb_radix_sort_int32_pairs.png + * \image html lsb_radix_sort_int64_pairs.png + * + * \par Snippet + * The code snippet below illustrates the sorting of a device vector of \p int keys + * with associated vector of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_key_alt_buf; // e.g., [ ... ] + * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] + * int *d_value_alt_buf; // e.g., [ ... ] + * ... + * + * // Create a set of DoubleBuffers to wrap pairs of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items); + * + * // d_keys.Current() <-- [0, 3, 5, 6, 7, 8, 9] + * // d_values.Current() <-- [5, 4, 3, 1, 2, 0, 6] + * + * \endcode + * + * \tparam KeyT [inferred] KeyT type + * \tparam ValueT [inferred] ValueT type + */ + template < + typename KeyT, + typename ValueT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortPairs( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. 
When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + DoubleBuffer &d_values, ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values + int num_items, ///< [in] Number of items to sort + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + return DispatchRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + true, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts key-value pairs into descending order. (~2N auxiliary storage required). + * + * \par + * - The contents of the input data are not altered by the sorting operation + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. + * - \devicestorage + * + * \par Performance + * Performance is similar to DeviceRadixSort::SortPairs. + * + * \par Snippet + * The code snippet below illustrates the sorting of a device vector of \p int keys + * with associated vector of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_keys_out; // e.g., [ ... ] + * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] + * int *d_values_out; // e.g., [ ... ] + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, + * d_keys_in, d_keys_out, d_values_in, d_values_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, + * d_keys_in, d_keys_out, d_values_in, d_values_out, num_items); + * + * // d_keys_out <-- [9, 8, 7, 6, 5, 3, 0] + * // d_values_out <-- [6, 0, 2, 1, 3, 4, 5] + * + * \endcode + * + * \tparam KeyT [inferred] KeyT type + * \tparam ValueT [inferred] ValueT type + */ + template < + typename KeyT, + typename ValueT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortPairsDescending( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + const KeyT *d_keys_in, ///< [in] Pointer to the input data of key data to sort + KeyT *d_keys_out, ///< [out] Pointer to the sorted output sequence of key data + const ValueT *d_values_in, ///< [in] Pointer to the corresponding input sequence of associated value items + ValueT *d_values_out, ///< [out] Pointer to the correspondingly-reordered output sequence of associated value items + int num_items, ///< [in] Number of items to sort + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + DoubleBuffer d_values(const_cast(d_values_in), d_values_out); + + return DispatchRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + false, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts key-value pairs into descending order. (~N auxiliary storage required). + * + * \par + * - The sorting operation is given a pair of key buffers and a corresponding + * pair of associated value buffers. Each pair is managed by a DoubleBuffer + * structure that indicates which of the two buffers is "current" (and thus + * contains the input data to be sorted). + * - The contents of both buffers within each pair may be altered by the sorting + * operation. + * - Upon completion, the sorting operation will update the "current" indicator + * within each DoubleBuffer wrapper to reference which of the two buffers + * now contains the sorted output sequence (a function of the number of key bits + * specified and the targeted device architecture). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. 
+ * - \devicestorageP + * - \devicestorage + * + * \par Performance + * Performance is similar to DeviceRadixSort::SortPairs. + * + * \par Snippet + * The code snippet below illustrates the sorting of a device vector of \p int keys + * with associated vector of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_key_alt_buf; // e.g., [ ... ] + * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] + * int *d_value_alt_buf; // e.g., [ ... ] + * ... + * + * // Create a set of DoubleBuffers to wrap pairs of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, num_items); + * + * // d_keys.Current() <-- [9, 8, 7, 6, 5, 3, 0] + * // d_values.Current() <-- [6, 0, 2, 1, 3, 4, 5] + * + * \endcode + * + * \tparam KeyT [inferred] KeyT type + * \tparam ValueT [inferred] ValueT type + */ + template < + typename KeyT, + typename ValueT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortPairsDescending( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + DoubleBuffer &d_values, ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values + int num_items, ///< [in] Number of items to sort + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + return DispatchRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + true, + stream, + debug_synchronous); + } + + + //@} end member group + /******************************************************************//** + * \name Keys-only + *********************************************************************/ + //@{ + + + /** + * \brief Sorts keys into ascending order. 
(~2N auxiliary storage required) + * + * \par + * - The contents of the input data are not altered by the sorting operation + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated sorting performance across different + * CUDA architectures for uniform-random \p uint32 and \p uint64 keys, respectively. + * + * \image html lsb_radix_sort_int32_keys.png + * \image html lsb_radix_sort_int64_keys.png + * + * \par Snippet + * The code snippet below illustrates the sorting of a device vector of \p int keys. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_keys_out; // e.g., [ ... ] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items); + * + * // d_keys_out <-- [0, 3, 5, 6, 7, 8, 9] + * + * \endcode + * + * \tparam KeyT [inferred] KeyT type + */ + template + CUB_RUNTIME_FUNCTION + static cudaError_t SortKeys( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + const KeyT *d_keys_in, ///< [in] Pointer to the input data of key data to sort + KeyT *d_keys_out, ///< [out] Pointer to the sorted output sequence of key data + int num_items, ///< [in] Number of items to sort + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // Null value type + DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + DoubleBuffer d_values; + + return DispatchRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + false, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts keys into ascending order. (~N auxiliary storage required). + * + * \par + * - The sorting operation is given a pair of key buffers managed by a + * DoubleBuffer structure that indicates which of the two buffers is + * "current" (and thus contains the input data to be sorted). 
+ * - The contents of both buffers may be altered by the sorting operation. + * - Upon completion, the sorting operation will update the "current" indicator + * within the DoubleBuffer wrapper to reference which of the two buffers + * now contains the sorted output sequence (a function of the number of key bits + * specified and the targeted device architecture). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageP + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated sorting performance across different + * CUDA architectures for uniform-random \p uint32 and \p uint64 keys, respectively. + * + * \image html lsb_radix_sort_int32_keys.png + * \image html lsb_radix_sort_int64_keys.png + * + * \par Snippet + * The code snippet below illustrates the sorting of a device vector of \p int keys. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_key_alt_buf; // e.g., [ ... ] + * ... + * + * // Create a DoubleBuffer to wrap the pair of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items); + * + * // d_keys.Current() <-- [0, 3, 5, 6, 7, 8, 9] + * + * \endcode + * + * \tparam KeyT [inferred] KeyT type + */ + template + CUB_RUNTIME_FUNCTION + static cudaError_t SortKeys( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + int num_items, ///< [in] Number of items to sort + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // Null value type + DoubleBuffer d_values; + + return DispatchRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + true, + stream, + debug_synchronous); + } + + /** + * \brief Sorts keys into descending order. (~2N auxiliary storage required). 
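The DoubleBuffer-based SortKeys overload implemented above trades the ~2N variant's separate output array for a ping-pong pair of buffers, with d_keys.Current() reporting which buffer ends up holding the sorted keys. A minimal sketch of that calling pattern follows; the data are hypothetical, the include path is an assumption, and error handling and cudaFree are omitted.

    // Sketch: the low-storage (~N) interface with cub::DoubleBuffer.
    #include <cuda_runtime.h>
    #include <cub/cub.cuh>   // adjust the include path for the vendored local_cub copy

    int main()
    {
        const int num_items = 7;
        int h_keys[num_items] = { 8, 6, 7, 5, 3, 0, 9 };

        int *d_key_buf, *d_key_alt_buf;
        cudaMalloc(&d_key_buf,     num_items * sizeof(int));
        cudaMalloc(&d_key_alt_buf, num_items * sizeof(int));
        cudaMemcpy(d_key_buf, h_keys, num_items * sizeof(int), cudaMemcpyHostToDevice);

        // The DoubleBuffer tracks which of the two buffers is "current"
        cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);

        void  *d_temp_storage = NULL;
        size_t temp_storage_bytes = 0;
        cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items);
        cudaMalloc(&d_temp_storage, temp_storage_bytes);
        cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items);
        cudaDeviceSynchronize();

        // d_keys.Current() now points at whichever buffer received the
        // sorted output: [0, 3, 5, 6, 7, 8, 9]
        int *d_sorted = d_keys.Current();
        (void) d_sorted;
        return 0;
    }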
+ * + * \par + * - The contents of the input data are not altered by the sorting operation + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. + * - \devicestorage + * + * \par Performance + * Performance is similar to DeviceRadixSort::SortKeys. + * + * \par Snippet + * The code snippet below illustrates the sorting of a device vector of \p int keys. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_keys_out; // e.g., [ ... ] + * ... + * + * // Create a DoubleBuffer to wrap the pair of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, num_items); + * + * // d_keys_out <-- [9, 8, 7, 6, 5, 3, 0]s + * + * \endcode + * + * \tparam KeyT [inferred] KeyT type + */ + template + CUB_RUNTIME_FUNCTION + static cudaError_t SortKeysDescending( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + const KeyT *d_keys_in, ///< [in] Pointer to the input data of key data to sort + KeyT *d_keys_out, ///< [out] Pointer to the sorted output sequence of key data + int num_items, ///< [in] Number of items to sort + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + DoubleBuffer d_values; + + return DispatchRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + false, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts keys into descending order. (~N auxiliary storage required). + * + * \par + * - The sorting operation is given a pair of key buffers managed by a + * DoubleBuffer structure that indicates which of the two buffers is + * "current" (and thus contains the input data to be sorted). + * - The contents of both buffers may be altered by the sorting operation. 
+ * - Upon completion, the sorting operation will update the "current" indicator + * within the DoubleBuffer wrapper to reference which of the two buffers + * now contains the sorted output sequence (a function of the number of key bits + * specified and the targeted device architecture). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageP + * - \devicestorage + * + * \par Performance + * Performance is similar to DeviceRadixSort::SortKeys. + * + * \par Snippet + * The code snippet below illustrates the sorting of a device vector of \p int keys. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_key_alt_buf; // e.g., [ ... ] + * ... + * + * // Create a DoubleBuffer to wrap the pair of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, num_items); + * + * // d_keys.Current() <-- [9, 8, 7, 6, 5, 3, 0] + * + * \endcode + * + * \tparam KeyT [inferred] KeyT type + */ + template + CUB_RUNTIME_FUNCTION + static cudaError_t SortKeysDescending( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + int num_items, ///< [in] Number of items to sort + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ { + // Signed integer type for global offsets + typedef int OffsetT; + + // Null value type + DoubleBuffer d_values; + + return DispatchRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + begin_bit, + end_bit, + true, + stream, + debug_synchronous); + } + + + //@} end member group + + +}; + +/** + * \example example_device_radix_sort.cu + */ + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/GraphBLAS/CUDA/local_cub/device/device_reduce.cuh b/GraphBLAS/CUDA/local_cub/device/device_reduce.cuh new file mode 100644 index 0000000000..13c7a72d1a --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/device/device_reduce.cuh @@ -0,0 +1,734 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include +#include + +#include "../iterator/arg_index_input_iterator.cuh" +#include "dispatch/dispatch_reduce.cuh" +#include "dispatch/dispatch_reduce_by_key.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within device-accessible memory. ![](reduce_logo.png) + * \ingroup SingleModule + * + * \par Overview + * A reduction (or fold) + * uses a binary combining operator to compute a single aggregate from a sequence of input elements. 
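As a concrete instance of such a fold, the sketch below runs a device-wide integer sum with DeviceReduce::Sum, following the same size-query-then-run pattern used throughout these headers. The data and include path are illustrative assumptions; error checking and cudaFree are omitted.

    // Sketch: device-wide sum of 7 ints with DeviceReduce::Sum.
    #include <cuda_runtime.h>
    #include <cub/cub.cuh>   // adjust the include path for the vendored local_cub copy

    int main()
    {
        const int num_items = 7;
        int h_in[num_items] = { 8, 6, 7, 5, 3, 0, 9 };

        int *d_in, *d_out;
        cudaMalloc(&d_in,  num_items * sizeof(int));
        cudaMalloc(&d_out, sizeof(int));
        cudaMemcpy(d_in, h_in, num_items * sizeof(int), cudaMemcpyHostToDevice);

        // First call sizes the temporary storage, second call reduces
        void  *d_temp_storage = NULL;
        size_t temp_storage_bytes = 0;
        cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
        cudaMalloc(&d_temp_storage, temp_storage_bytes);
        cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);

        int h_out = 0;
        cudaMemcpy(&h_out, d_out, sizeof(int), cudaMemcpyDeviceToHost);
        // h_out == 38
        return 0;
    }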
+ * + * \par Usage Considerations + * \cdp_class{DeviceReduce} + * + * \par Performance + * \linear_performance{reduction, reduce-by-key, and run-length encode} + * + * \par + * The following chart illustrates DeviceReduce::Sum + * performance across different CUDA architectures for \p int32 keys. + * + * \image html reduce_int32.png + * + * \par + * The following chart illustrates DeviceReduce::ReduceByKey (summation) + * performance across different CUDA architectures for \p fp32 + * values. Segments are identified by \p int32 keys, and have lengths uniformly sampled from [1,1000]. + * + * \image html reduce_by_key_fp32_len_500.png + * + * \par + * \plots_below + * + */ +struct DeviceReduce +{ + /** + * \brief Computes a device-wide reduction using the specified binary \p reduction_op functor and initial value \p init. + * + * \par + * - Does not support binary reduction operators that are non-commutative. + * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates a user-defined min-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // CustomMin functor + * struct CustomMin + * { + * template + * __device__ __forceinline__ + * T operator()(const T &a, const T &b) const { + * return (b < a) ? b : a; + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [-] + * CustomMin min_op; + * int init; // e.g., INT_MAX + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, min_op, init); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run reduction + * cub::DeviceReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items, min_op, init); + * + * // d_out <-- [0] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator + * \tparam ReductionOpT [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) + * \tparam T [inferred] Data element type that is convertible to the \p value type of \p InputIteratorT + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename ReductionOpT, + typename T> + CUB_RUNTIME_FUNCTION + static cudaError_t Reduce( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + ReductionOpT reduction_op, ///< [in] Binary reduction functor + T init, ///< [in] Initial value of the reduction + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + return DispatchReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_items, + reduction_op, + init, + stream, + debug_synchronous); + } + + + /** + * \brief Computes a device-wide sum using the addition (\p +) operator. + * + * \par + * - Uses \p 0 as the initial value of the reduction. + * - Does not support \p + operators that are non-commutative.. + * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated sum-reduction performance across different + * CUDA architectures for \p int32 and \p int64 items, respectively. + * + * \image html reduce_int32.png + * \image html reduce_int64.png + * + * \par Snippet + * The code snippet below illustrates the sum-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [-] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sum-reduction + * cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + * + * // d_out <-- [38] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t Sum( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type + + return DispatchReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_items, + cub::Sum(), + OutputT(), // zero-initialize + stream, + debug_synchronous); + } + + + /** + * \brief Computes a device-wide minimum using the less-than ('<') operator. + * + * \par + * - Uses std::numeric_limits::max() as the initial value of the reduction. + * - Does not support \p < operators that are non-commutative. + * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the min-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [-] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run min-reduction + * cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + * + * // d_out <-- [0] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t Min( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // The input value type + typedef typename std::iterator_traits::value_type InputT; + + return DispatchReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_items, + cub::Min(), + Traits::Max(), // replace with std::numeric_limits::max() when C++11 support is more prevalent + stream, + debug_synchronous); + } + + + /** + * \brief Finds the first device-wide minimum using the less-than ('<') operator, also returning the index of that item. + * + * \par + * - The output value type of \p d_out is cub::KeyValuePair (assuming the value type of \p d_in is \p T) + * - The minimum is written to d_out.value and its offset in the input array is written to d_out.key. + * - The {1, std::numeric_limits::max()} tuple is produced for zero-length inputs + * - Does not support \p < operators that are non-commutative. + * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the argmin-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * KeyValuePair *d_out; // e.g., [{-,-}] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run argmin-reduction + * cub::DeviceReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_argmin, num_items); + * + * // d_out <-- [{5, 0}] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items (of some type \p T) \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate (having value type cub::KeyValuePair) \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t ArgMin( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // The input type + typedef typename std::iterator_traits::value_type InputValueT; + + // The output tuple type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + KeyValuePair, // ... then the key value pair OffsetT + InputValueT + typename std::iterator_traits::value_type>::Type OutputTupleT; // ... else the output iterator's value type + + // The output value type + typedef typename OutputTupleT::Value OutputValueT; + + // Wrapped input iterator to produce index-value tuples + typedef ArgIndexInputIterator ArgIndexInputIteratorT; + ArgIndexInputIteratorT d_indexed_in(d_in); + + // Initial value + OutputTupleT initial_value(1, Traits::Max()); // replace with std::numeric_limits::max() when C++11 support is more prevalent + + return DispatchReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_indexed_in, + d_out, + num_items, + cub::ArgMin(), + initial_value, + stream, + debug_synchronous); + } + + + /** + * \brief Computes a device-wide maximum using the greater-than ('>') operator. + * + * \par + * - Uses std::numeric_limits::lowest() as the initial value of the reduction. + * - Does not support \p > operators that are non-commutative. + * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the max-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [-] + * ... 
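For orientation, a minimal self-contained sketch of the two-pass calling convention used by DeviceReduce::Min and DeviceReduce::ArgMin above (the <cub/cub.cuh> umbrella include, the host-side setup, and names such as h_in and d_temp_storage2 are illustrative assumptions, not part of this header):

#include <cub/cub.cuh>
#include <cstdio>

int main()
{
    const int num_items = 7;
    int h_in[num_items] = {8, 6, 7, 5, 3, 0, 9};

    int *d_in, *d_min;
    cub::KeyValuePair<int, int> *d_argmin;
    cudaMalloc(&d_in, num_items * sizeof(int));
    cudaMalloc(&d_min, sizeof(int));
    cudaMalloc(&d_argmin, sizeof(*d_argmin));
    cudaMemcpy(d_in, h_in, num_items * sizeof(int), cudaMemcpyHostToDevice);

    // Pass 1 (d_temp_storage == NULL): only the required temp-storage size is written.
    void *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_min, num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    // Pass 2: the reduction itself.  d_min <-- 0
    cub::DeviceReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_min, num_items);

    // ArgMin follows the same pattern; its result is a {key = offset, value = minimum} pair.
    void *d_temp_storage2 = NULL;
    size_t temp_storage_bytes2 = 0;
    cub::DeviceReduce::ArgMin(d_temp_storage2, temp_storage_bytes2, d_in, d_argmin, num_items);
    cudaMalloc(&d_temp_storage2, temp_storage_bytes2);
    cub::DeviceReduce::ArgMin(d_temp_storage2, temp_storage_bytes2, d_in, d_argmin, num_items);  // {5, 0}

    int h_min;
    cub::KeyValuePair<int, int> h_argmin;
    cudaMemcpy(&h_min, d_min, sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(&h_argmin, d_argmin, sizeof(h_argmin), cudaMemcpyDeviceToHost);
    printf("min = %d at offset %d\n", h_argmin.value, h_argmin.key);

    cudaFree(d_in); cudaFree(d_min); cudaFree(d_argmin);
    cudaFree(d_temp_storage); cudaFree(d_temp_storage2);
    return 0;
}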
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_max, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run max-reduction + * cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_max, num_items); + * + * // d_out <-- [9] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t Max( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // The input value type + typedef typename std::iterator_traits::value_type InputT; + + return DispatchReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_items, + cub::Max(), + Traits::Lowest(), // replace with std::numeric_limits::lowest() when C++11 support is more prevalent + stream, + debug_synchronous); + } + + + /** + * \brief Finds the first device-wide maximum using the greater-than ('>') operator, also returning the index of that item + * + * \par + * - The output value type of \p d_out is cub::KeyValuePair (assuming the value type of \p d_in is \p T) + * - The maximum is written to d_out.value and its offset in the input array is written to d_out.key. + * - The {1, std::numeric_limits::lowest()} tuple is produced for zero-length inputs + * - Does not support \p > operators that are non-commutative. + * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the argmax-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * KeyValuePair *d_out; // e.g., [{-,-}] + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run argmax-reduction + * cub::DeviceReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_argmax, num_items); + * + * // d_out <-- [{6, 9}] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items (of some type \p T) \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate (having value type cub::KeyValuePair) \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t ArgMax( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // The input type + typedef typename std::iterator_traits::value_type InputValueT; + + // The output tuple type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + KeyValuePair, // ... then the key value pair OffsetT + InputValueT + typename std::iterator_traits::value_type>::Type OutputTupleT; // ... else the output iterator's value type + + // The output value type + typedef typename OutputTupleT::Value OutputValueT; + + // Wrapped input iterator to produce index-value tuples + typedef ArgIndexInputIterator ArgIndexInputIteratorT; + ArgIndexInputIteratorT d_indexed_in(d_in); + + // Initial value + OutputTupleT initial_value(1, Traits::Lowest()); // replace with std::numeric_limits::lowest() when C++11 support is more prevalent + + return DispatchReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_indexed_in, + d_out, + num_items, + cub::ArgMax(), + initial_value, + stream, + debug_synchronous); + } + + + /** + * \brief Reduces segments of values, where segments are demarcated by corresponding runs of identical keys. + * + * \par + * This operation computes segmented reductions within \p d_values_in using + * the specified binary \p reduction_op functor. The segments are identified by + * "runs" of corresponding keys in \p d_keys_in, where runs are maximal ranges of + * consecutive, identical keys. For the ith run encountered, + * the first key of the run and the corresponding value aggregate of that run are + * written to d_unique_out[i] and d_aggregates_out[i], + * respectively. The total number of runs encountered is written to \p d_num_runs_out. 
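Max and ArgMax mirror the Min/ArgMin interface; a hedged, self-contained sketch under the same assumptions (umbrella header, illustrative host setup) follows:

#include <cub/cub.cuh>
#include <cstdio>

int main()
{
    const int num_items = 7;
    int h_in[num_items] = {8, 6, 7, 5, 3, 0, 9};

    int *d_in, *d_max;
    cub::KeyValuePair<int, int> *d_argmax;
    cudaMalloc(&d_in, num_items * sizeof(int));
    cudaMalloc(&d_max, sizeof(int));
    cudaMalloc(&d_argmax, sizeof(*d_argmax));
    cudaMemcpy(d_in, h_in, num_items * sizeof(int), cudaMemcpyHostToDevice);

    // Size query with d_temp_storage == NULL, then the actual reduction.
    void *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_max, num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_max, num_items);          // d_max <-- 9

    // ArgMax writes a {key = offset, value = maximum} pair.
    void *d_temp_storage2 = NULL;
    size_t temp_storage_bytes2 = 0;
    cub::DeviceReduce::ArgMax(d_temp_storage2, temp_storage_bytes2, d_in, d_argmax, num_items);
    cudaMalloc(&d_temp_storage2, temp_storage_bytes2);
    cub::DeviceReduce::ArgMax(d_temp_storage2, temp_storage_bytes2, d_in, d_argmax, num_items);  // {6, 9}

    cub::KeyValuePair<int, int> h_argmax;
    cudaMemcpy(&h_argmax, d_argmax, sizeof(h_argmax), cudaMemcpyDeviceToHost);
    printf("max = %d at offset %d\n", h_argmax.value, h_argmax.key);
    return 0;
}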
+ * + * \par + * - The == equality operator is used to determine whether keys are equivalent + * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. + * - \devicestorage + * + * \par Performance + * The following chart illustrates reduction-by-key (sum) performance across + * different CUDA architectures for \p fp32 and \p fp64 values, respectively. Segments + * are identified by \p int32 keys, and have lengths uniformly sampled from [1,1000]. + * + * \image html reduce_by_key_fp32_len_500.png + * \image html reduce_by_key_fp64_len_500.png + * + * \par + * The following charts are similar, but with segment lengths uniformly sampled from [1,10]: + * + * \image html reduce_by_key_fp32_len_5.png + * \image html reduce_by_key_fp64_len_5.png + * + * \par Snippet + * The code snippet below illustrates the segmented reduction of \p int values grouped + * by runs of associated \p int keys. + * \par + * \code + * #include // or equivalently + * + * // CustomMin functor + * struct CustomMin + * { + * template + * CUB_RUNTIME_FUNCTION __forceinline__ + * T operator()(const T &a, const T &b) const { + * return (b < a) ? b : a; + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 8 + * int *d_keys_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] + * int *d_values_in; // e.g., [0, 7, 1, 6, 2, 5, 3, 4] + * int *d_unique_out; // e.g., [-, -, -, -, -, -, -, -] + * int *d_aggregates_out; // e.g., [-, -, -, -, -, -, -, -] + * int *d_num_runs_out; // e.g., [-] + * CustomMin reduction_op; + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, reduction_op, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run reduce-by-key + * cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, reduction_op, num_items); + * + * // d_unique_out <-- [0, 2, 9, 5, 8] + * // d_aggregates_out <-- [0, 1, 6, 2, 4] + * // d_num_runs_out <-- [5] + * + * \endcode + * + * \tparam KeysInputIteratorT [inferred] Random-access input iterator type for reading input keys \iterator + * \tparam UniqueOutputIteratorT [inferred] Random-access output iterator type for writing unique output keys \iterator + * \tparam ValuesInputIteratorT [inferred] Random-access input iterator type for reading input values \iterator + * \tparam AggregatesOutputIterator [inferred] Random-access output iterator type for writing output value aggregates \iterator + * \tparam NumRunsOutputIteratorT [inferred] Output iterator type for recording the number of runs encountered \iterator + * \tparam ReductionOpT [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) + */ + template < + typename KeysInputIteratorT, + typename UniqueOutputIteratorT, + typename ValuesInputIteratorT, + typename AggregatesOutputIteratorT, + typename NumRunsOutputIteratorT, + typename ReductionOpT> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t ReduceByKey( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + KeysInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys + UniqueOutputIteratorT d_unique_out, ///< [out] Pointer to the output sequence of unique keys (one key per run) + ValuesInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of corresponding values + AggregatesOutputIteratorT d_aggregates_out, ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run) + NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out) + ReductionOpT reduction_op, ///< [in] Binary reduction functor + int num_items, ///< [in] Total number of associated key+value pairs (i.e., the length of \p d_in_keys and \p d_in_values) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
+ { + // Signed integer type for global offsets + typedef int OffsetT; + + // FlagT iterator type (not used) + + // Selection op (not used) + + // Default == operator + typedef Equality EqualityOp; + + return DispatchReduceByKey::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_unique_out, + d_values_in, + d_aggregates_out, + d_num_runs_out, + EqualityOp(), + reduction_op, + num_items, + stream, + debug_synchronous); + } + +}; + +/** + * \example example_device_reduce.cu + */ + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/GraphBLAS/CUDA/local_cub/device/device_run_length_encode.cuh b/GraphBLAS/CUDA/local_cub/device/device_run_length_encode.cuh new file mode 100644 index 0000000000..7a2e82d9d7 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/device/device_run_length_encode.cuh @@ -0,0 +1,278 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceRunLengthEncode provides device-wide, parallel operations for computing a run-length encoding across a sequence of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "dispatch/dispatch_rle.cuh" +#include "dispatch/dispatch_reduce_by_key.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceRunLengthEncode provides device-wide, parallel operations for demarcating "runs" of same-valued items within a sequence residing within device-accessible memory. 
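A compact, self-contained sketch of the ReduceByKey call documented above, using the same example data as the snippet; the include line, functor name, and host scaffolding are illustrative assumptions:

#include <cub/cub.cuh>
#include <cstdio>

// Binary reduction functor with the same shape as the CustomMin in the snippet above
struct CustomMin
{
    template <typename T>
    CUB_RUNTIME_FUNCTION __forceinline__
    T operator()(const T &a, const T &b) const { return (b < a) ? b : a; }
};

int main()
{
    const int num_items = 8;
    int h_keys[num_items]   = {0, 2, 2, 9, 5, 5, 5, 8};
    int h_values[num_items] = {0, 7, 1, 6, 2, 5, 3, 4};

    int *d_keys_in, *d_values_in, *d_unique_out, *d_aggregates_out, *d_num_runs_out;
    cudaMalloc(&d_keys_in,        num_items * sizeof(int));
    cudaMalloc(&d_values_in,      num_items * sizeof(int));
    cudaMalloc(&d_unique_out,     num_items * sizeof(int));
    cudaMalloc(&d_aggregates_out, num_items * sizeof(int));
    cudaMalloc(&d_num_runs_out,   sizeof(int));
    cudaMemcpy(d_keys_in,   h_keys,   num_items * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_values_in, h_values, num_items * sizeof(int), cudaMemcpyHostToDevice);

    CustomMin reduction_op;
    void *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;

    // Size-query pass, allocation, then the reduce-by-key pass itself.
    cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes,
        d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out,
        reduction_op, num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceReduce::ReduceByKey(d_temp_storage, temp_storage_bytes,
        d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out,
        reduction_op, num_items);

    // Expected: d_unique_out = [0, 2, 9, 5, 8], d_aggregates_out = [0, 1, 6, 2, 4], d_num_runs_out = [5]
    int h_num_runs;
    cudaMemcpy(&h_num_runs, d_num_runs_out, sizeof(int), cudaMemcpyDeviceToHost);
    printf("%d runs\n", h_num_runs);
    return 0;
}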
![](run_length_encode_logo.png) + * \ingroup SingleModule + * + * \par Overview + * A run-length encoding + * computes a simple compressed representation of a sequence of input elements such that each + * maximal "run" of consecutive same-valued data items is encoded as a single data value along with a + * count of the elements in that run. + * + * \par Usage Considerations + * \cdp_class{DeviceRunLengthEncode} + * + * \par Performance + * \linear_performance{run-length encode} + * + * \par + * The following chart illustrates DeviceRunLengthEncode::RunLengthEncode performance across + * different CUDA architectures for \p int32 items. + * Segments have lengths uniformly sampled from [1,1000]. + * + * \image html rle_int32_len_500.png + * + * \par + * \plots_below + * + */ +struct DeviceRunLengthEncode +{ + + /** + * \brief Computes a run-length encoding of the sequence \p d_in. + * + * \par + * - For the ith run encountered, the first key of the run and its length are written to + * d_unique_out[i] and d_counts_out[i], + * respectively. + * - The total number of runs encountered is written to \p d_num_runs_out. + * - The == equality operator is used to determine whether values are equivalent + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated encode performance across different + * CUDA architectures for \p int32 and \p int64 items, respectively. Segments have + * lengths uniformly sampled from [1,1000]. + * + * \image html rle_int32_len_500.png + * \image html rle_int64_len_500.png + * + * \par + * The following charts are similar, but with segment lengths uniformly sampled from [1,10]: + * + * \image html rle_int32_len_5.png + * \image html rle_int64_len_5.png + * + * \par Snippet + * The code snippet below illustrates the run-length encoding of a sequence of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 8 + * int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] + * int *d_unique_out; // e.g., [ , , , , , , , ] + * int *d_counts_out; // e.g., [ , , , , , , , ] + * int *d_num_runs_out; // e.g., [ ] + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run encoding + * cub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes, d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items); + * + * // d_unique_out <-- [0, 2, 9, 5, 8] + * // d_counts_out <-- [1, 2, 1, 3, 1] + * // d_num_runs_out <-- [5] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam UniqueOutputIteratorT [inferred] Random-access output iterator type for writing unique output items \iterator + * \tparam LengthsOutputIteratorT [inferred] Random-access output iterator type for writing output counts \iterator + * \tparam NumRunsOutputIteratorT [inferred] Output iterator type for recording the number of runs encountered \iterator + */ + template < + typename InputIteratorT, + typename UniqueOutputIteratorT, + typename LengthsOutputIteratorT, + typename NumRunsOutputIteratorT> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Encode( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of keys + UniqueOutputIteratorT d_unique_out, ///< [out] Pointer to the output sequence of unique keys (one key per run) + LengthsOutputIteratorT d_counts_out, ///< [out] Pointer to the output sequence of run-lengths (one count per run) + NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs + int num_items, ///< [in] Total number of associated key+value pairs (i.e., the length of \p d_in_keys and \p d_in_values) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + typedef int OffsetT; // Signed integer type for global offsets + typedef NullType* FlagIterator; // FlagT iterator type (not used) + typedef NullType SelectOp; // Selection op (not used) + typedef Equality EqualityOp; // Default == operator + typedef cub::Sum ReductionOp; // Value reduction operator + + // The lengths output value type + typedef typename If<(Equals::value_type, void>::VALUE), // LengthT = (if output iterator's value type is void) ? + OffsetT, // ... then the OffsetT type, + typename std::iterator_traits::value_type>::Type LengthT; // ... 
else the output iterator's value type + + // Generator type for providing 1s values for run-length reduction + typedef ConstantInputIterator LengthsInputIteratorT; + + return DispatchReduceByKey::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_unique_out, + LengthsInputIteratorT((LengthT) 1), + d_counts_out, + d_num_runs_out, + EqualityOp(), + ReductionOp(), + num_items, + stream, + debug_synchronous); + } + + + /** + * \brief Enumerates the starting offsets and lengths of all non-trivial runs (of length > 1) of same-valued keys in the sequence \p d_in. + * + * \par + * - For the ith non-trivial run, the run's starting offset + * and its length are written to d_offsets_out[i] and + * d_lengths_out[i], respectively. + * - The total number of runs encountered is written to \p d_num_runs_out. + * - The == equality operator is used to determine whether values are equivalent + * - \devicestorage + * + * \par Performance + * + * \par Snippet + * The code snippet below illustrates the identification of non-trivial runs within a sequence of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 8 + * int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] + * int *d_offsets_out; // e.g., [ , , , , , , , ] + * int *d_lengths_out; // e.g., [ , , , , , , , ] + * int *d_num_runs_out; // e.g., [ ] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceRunLengthEncode::NonTrivialRuns(d_temp_storage, temp_storage_bytes, d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run encoding + * cub::DeviceRunLengthEncode::NonTrivialRuns(d_temp_storage, temp_storage_bytes, d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items); + * + * // d_offsets_out <-- [1, 4] + * // d_lengths_out <-- [2, 3] + * // d_num_runs_out <-- [2] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OffsetsOutputIteratorT [inferred] Random-access output iterator type for writing run-offset values \iterator + * \tparam LengthsOutputIteratorT [inferred] Random-access output iterator type for writing run-length values \iterator + * \tparam NumRunsOutputIteratorT [inferred] Output iterator type for recording the number of runs encountered \iterator + */ + template < + typename InputIteratorT, + typename OffsetsOutputIteratorT, + typename LengthsOutputIteratorT, + typename NumRunsOutputIteratorT> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t NonTrivialRuns( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
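The Encode entry point above reduces runs of identical values to (unique value, run length) pairs; a minimal sketch under the same assumptions as the earlier examples:

#include <cub/cub.cuh>
#include <cstdio>

int main()
{
    const int num_items = 8;
    int h_in[num_items] = {0, 2, 2, 9, 5, 5, 5, 8};

    int *d_in, *d_unique_out, *d_counts_out, *d_num_runs_out;
    cudaMalloc(&d_in,           num_items * sizeof(int));
    cudaMalloc(&d_unique_out,   num_items * sizeof(int));
    cudaMalloc(&d_counts_out,   num_items * sizeof(int));
    cudaMalloc(&d_num_runs_out, sizeof(int));
    cudaMemcpy(d_in, h_in, num_items * sizeof(int), cudaMemcpyHostToDevice);

    // Size query, allocation, then the encoding pass.
    void *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes,
        d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceRunLengthEncode::Encode(d_temp_storage, temp_storage_bytes,
        d_in, d_unique_out, d_counts_out, d_num_runs_out, num_items);

    // Expected: d_unique_out = [0, 2, 9, 5, 8], d_counts_out = [1, 2, 1, 3, 1], d_num_runs_out = [5]
    int h_num_runs;
    cudaMemcpy(&h_num_runs, d_num_runs_out, sizeof(int), cudaMemcpyDeviceToHost);
    printf("%d runs\n", h_num_runs);
    return 0;
}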
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to input sequence of data items + OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to output sequence of run-offsets (one offset per non-trivial run) + LengthsOutputIteratorT d_lengths_out, ///< [out] Pointer to output sequence of run-lengths (one count per non-trivial run) + NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs (i.e., length of \p d_offsets_out) + int num_items, ///< [in] Total number of associated key+value pairs (i.e., the length of \p d_in_keys and \p d_in_values) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + typedef int OffsetT; // Signed integer type for global offsets + typedef Equality EqualityOp; // Default == operator + + return DeviceRleDispatch::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_offsets_out, + d_lengths_out, + d_num_runs_out, + EqualityOp(), + num_items, + stream, + debug_synchronous); + } + + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/GraphBLAS/CUDA/local_cub/device/device_scan.cuh b/GraphBLAS/CUDA/local_cub/device/device_scan.cuh new file mode 100644 index 0000000000..e86fefe3cd --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/device/device_scan.cuh @@ -0,0 +1,443 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
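NonTrivialRuns, in contrast to Encode, reports only runs of length greater than one, as starting offsets and lengths; a hedged usage sketch with the same illustrative host setup:

#include <cub/cub.cuh>
#include <cstdio>

int main()
{
    const int num_items = 8;
    int h_in[num_items] = {0, 2, 2, 9, 5, 5, 5, 8};

    int *d_in, *d_offsets_out, *d_lengths_out, *d_num_runs_out;
    cudaMalloc(&d_in,           num_items * sizeof(int));
    cudaMalloc(&d_offsets_out,  num_items * sizeof(int));
    cudaMalloc(&d_lengths_out,  num_items * sizeof(int));
    cudaMalloc(&d_num_runs_out, sizeof(int));
    cudaMemcpy(d_in, h_in, num_items * sizeof(int), cudaMemcpyHostToDevice);

    void *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceRunLengthEncode::NonTrivialRuns(d_temp_storage, temp_storage_bytes,
        d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceRunLengthEncode::NonTrivialRuns(d_temp_storage, temp_storage_bytes,
        d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items);

    // Expected: d_offsets_out = [1, 4], d_lengths_out = [2, 3], d_num_runs_out = [2]
    int h_num_runs;
    cudaMemcpy(&h_num_runs, d_num_runs_out, sizeof(int), cudaMemcpyDeviceToHost);
    printf("%d non-trivial runs\n", h_num_runs);
    return 0;
}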
+ * + ******************************************************************************/ + +/** + * \file + * cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "dispatch/dispatch_scan.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within device-accessible memory. ![](device_scan.png) + * \ingroup SingleModule + * + * \par Overview + * Given a sequence of input elements and a binary reduction operator, a [prefix scan](http://en.wikipedia.org/wiki/Prefix_sum) + * produces an output sequence where each element is computed to be the reduction + * of the elements occurring earlier in the input sequence. Prefix sum + * connotes a prefix scan with the addition operator. The term \em inclusive indicates + * that the ith output reduction incorporates the ith input. + * The term \em exclusive indicates the ith input is not incorporated into + * the ith output reduction. + * + * \par + * As of CUB 1.0.1 (2013), CUB's device-wide scan APIs have implemented our "decoupled look-back" algorithm + * for performing global prefix scan with only a single pass through the + * input data, as described in our 2016 technical report [1]. The central + * idea is to leverage a small, constant factor of redundant work in order to overlap the latencies + * of global prefix propagation with local computation. As such, our algorithm requires only + * ~2n data movement (n inputs are read, n outputs are written), and typically + * proceeds at "memcpy" speeds. + * + * \par + * [1] [Duane Merrill and Michael Garland. "Single-pass Parallel Prefix Scan with Decoupled Look-back", NVIDIA Technical Report NVR-2016-002, 2016.](https://research.nvidia.com/publication/single-pass-parallel-prefix-scan-decoupled-look-back) + * + * \par Usage Considerations + * \cdp_class{DeviceScan} + * + * \par Performance + * \linear_performance{prefix scan} + * + * \par + * The following chart illustrates DeviceScan::ExclusiveSum + * performance across different CUDA architectures for \p int32 keys. + * \plots_below + * + * \image html scan_int32.png + * + */ +struct DeviceScan +{ + /******************************************************************//** + * \name Exclusive scans + *********************************************************************/ + //@{ + + /** + * \brief Computes a device-wide exclusive prefix sum. The value of 0 is applied as the initial value, and is assigned to *d_out. + * + * \par + * - Supports non-commutative sum operators. + * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated exclusive sum performance across different + * CUDA architectures for \p int32 and \p int64 items, respectively. 
+ * + * \image html scan_int32.png + * \image html scan_int64.png + * + * \par Snippet + * The code snippet below illustrates the exclusive prefix sum of an \p int device vector. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [ , , , , , , ] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run exclusive prefix sum + * cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + * + * // d_out s<-- [0, 8, 14, 21, 26, 29, 29] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading scan inputs \iterator + * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing scan outputs \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t ExclusiveSum( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items + int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type + + // Initial value + OutputT init_value = 0; + + return DispatchScan::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + Sum(), + init_value, + num_items, + stream, + debug_synchronous); + } + + + /** + * \brief Computes a device-wide exclusive prefix scan using the specified binary \p scan_op functor. The \p init_value value is applied as the initial value, and is assigned to *d_out. + * + * \par + * - Supports non-commutative scan operators. + * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. 
+ * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the exclusive prefix min-scan of an \p int device vector + * \par + * \code + * #include // or equivalently + * + * // CustomMin functor + * struct CustomMin + * { + * template + * CUB_RUNTIME_FUNCTION __forceinline__ + * T operator()(const T &a, const T &b) const { + * return (b < a) ? b : a; + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [ , , , , , , ] + * CustomMin min_op + * ... + * + * // Determine temporary device storage requirements for exclusive prefix scan + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, (int) MAX_INT, num_items); + * + * // Allocate temporary storage for exclusive prefix scan + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run exclusive prefix min-scan + * cub::DeviceScan::ExclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, (int) MAX_INT, num_items); + * + * // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading scan inputs \iterator + * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing scan outputs \iterator + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + * \tparam Identity [inferred] Type of the \p identity value used Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename ScanOpT, + typename InitValueT> + CUB_RUNTIME_FUNCTION + static cudaError_t ExclusiveScan( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items + ScanOpT scan_op, ///< [in] Binary scan functor + InitValueT init_value, ///< [in] Initial value to seed the exclusive scan (and is assigned to *d_out) + int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + return DispatchScan::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + scan_op, + init_value, + num_items, + stream, + debug_synchronous); + } + + + //@} end member group + /******************************************************************//** + * \name Inclusive scans + *********************************************************************/ + //@{ + + + /** + * \brief Computes a device-wide inclusive prefix sum. + * + * \par + * - Supports non-commutative sum operators. 
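A self-contained sketch covering both ExclusiveSum and ExclusiveScan as documented above; INT_MAX (from <climits>) is used here as the seed value in place of the snippet's "(int) MAX_INT", and the host scaffolding is an illustrative assumption:

#include <cub/cub.cuh>
#include <climits>
#include <cstdio>

// Same shape as the CustomMin functor in the snippet above
struct CustomMin
{
    template <typename T>
    CUB_RUNTIME_FUNCTION __forceinline__
    T operator()(const T &a, const T &b) const { return (b < a) ? b : a; }
};

int main()
{
    const int num_items = 7;
    int h_in[num_items] = {8, 6, 7, 5, 3, 0, 9};

    int *d_in, *d_out;
    cudaMalloc(&d_in,  num_items * sizeof(int));
    cudaMalloc(&d_out, num_items * sizeof(int));
    cudaMemcpy(d_in, h_in, num_items * sizeof(int), cudaMemcpyHostToDevice);

    // Exclusive prefix sum: size query, allocation, then the scan itself.
    void *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
    // d_out <-- [0, 8, 14, 21, 26, 29, 29]

    // Exclusive min-scan with a user-supplied operator and INT_MAX as the initial value.
    CustomMin min_op;
    void *d_temp_storage2 = NULL;
    size_t temp_storage_bytes2 = 0;
    cub::DeviceScan::ExclusiveScan(d_temp_storage2, temp_storage_bytes2, d_in, d_out, min_op, INT_MAX, num_items);
    cudaMalloc(&d_temp_storage2, temp_storage_bytes2);
    cub::DeviceScan::ExclusiveScan(d_temp_storage2, temp_storage_bytes2, d_in, d_out, min_op, INT_MAX, num_items);
    // d_out <-- [2147483647, 8, 6, 6, 5, 3, 0]

    int h_out[num_items];
    cudaMemcpy(h_out, d_out, num_items * sizeof(int), cudaMemcpyDeviceToHost);
    for (int i = 0; i < num_items; i++) printf("%d ", h_out[i]);
    printf("\n");
    return 0;
}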
+ * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the inclusive prefix sum of an \p int device vector. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [ , , , , , , ] + * ... + * + * // Determine temporary device storage requirements for inclusive prefix sum + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + * + * // Allocate temporary storage for inclusive prefix sum + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run inclusive prefix sum + * cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items); + * + * // d_out <-- [8, 14, 21, 26, 29, 29, 38] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading scan inputs \iterator + * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing scan outputs \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t InclusiveSum( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items + int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + return DispatchScan::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + Sum(), + NullType(), + num_items, + stream, + debug_synchronous); + } + + + /** + * \brief Computes a device-wide inclusive prefix scan using the specified binary \p scan_op functor. + * + * \par + * - Supports non-commutative scan operators. + * - Provides "run-to-run" determinism for pseudo-associative reduction + * (e.g., addition of floating point types) on the same GPU device. + * However, results for pseudo-associative reduction may be inconsistent + * from one device to a another device of a different compute-capability + * because CUB can employ different tile-sizing for different architectures. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the inclusive prefix min-scan of an \p int device vector. 
+ * \par + * \code + * #include // or equivalently + * + * // CustomMin functor + * struct CustomMin + * { + * template + * CUB_RUNTIME_FUNCTION __forceinline__ + * T operator()(const T &a, const T &b) const { + * return (b < a) ? b : a; + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 7 + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [ , , , , , , ] + * CustomMin min_op; + * ... + * + * // Determine temporary device storage requirements for inclusive prefix scan + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, num_items); + * + * // Allocate temporary storage for inclusive prefix scan + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run inclusive prefix min-scan + * cub::DeviceScan::InclusiveScan(d_temp_storage, temp_storage_bytes, d_in, d_out, min_op, num_items); + * + * // d_out <-- [8, 6, 6, 5, 3, 0, 0] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading scan inputs \iterator + * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing scan outputs \iterator + * \tparam ScanOp [inferred] Binary scan functor type having member T operator()(const T &a, const T &b) + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename ScanOpT> + CUB_RUNTIME_FUNCTION + static cudaError_t InclusiveScan( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items + ScanOpT scan_op, ///< [in] Binary scan functor + int num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + return DispatchScan::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + scan_op, + NullType(), + num_items, + stream, + debug_synchronous); + } + + //@} end member group + +}; + +/** + * \example example_device_scan.cu + */ + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/GraphBLAS/CUDA/local_cub/device/device_segmented_radix_sort.cuh b/GraphBLAS/CUDA/local_cub/device/device_segmented_radix_sort.cuh new file mode 100644 index 0000000000..0d36076277 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/device/device_segmented_radix_sort.cuh @@ -0,0 +1,876 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
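The inclusive variants take no initial value, since the ith output already incorporates the ith input; a hedged sketch of InclusiveSum and InclusiveScan with the same illustrative setup:

#include <cub/cub.cuh>
#include <cstdio>

// Same shape as the CustomMin functor used in the snippet above
struct CustomMin
{
    template <typename T>
    CUB_RUNTIME_FUNCTION __forceinline__
    T operator()(const T &a, const T &b) const { return (b < a) ? b : a; }
};

int main()
{
    const int num_items = 7;
    int h_in[num_items] = {8, 6, 7, 5, 3, 0, 9};

    int *d_in, *d_out;
    cudaMalloc(&d_in,  num_items * sizeof(int));
    cudaMalloc(&d_out, num_items * sizeof(int));
    cudaMemcpy(d_in, h_in, num_items * sizeof(int), cudaMemcpyHostToDevice);

    // Inclusive prefix sum: d_out <-- [8, 14, 21, 26, 29, 29, 38]
    void *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);

    // Inclusive min-scan (no initial value needed): d_out <-- [8, 6, 6, 5, 3, 0, 0]
    CustomMin min_op;
    void *d_temp_storage2 = NULL;
    size_t temp_storage_bytes2 = 0;
    cub::DeviceScan::InclusiveScan(d_temp_storage2, temp_storage_bytes2, d_in, d_out, min_op, num_items);
    cudaMalloc(&d_temp_storage2, temp_storage_bytes2);
    cub::DeviceScan::InclusiveScan(d_temp_storage2, temp_storage_bytes2, d_in, d_out, min_op, num_items);

    int h_out[num_items];
    cudaMemcpy(h_out, d_out, num_items * sizeof(int), cudaMemcpyDeviceToHost);
    for (int i = 0; i < num_items; i++) printf("%d ", h_out[i]);
    printf("\n");
    return 0;
}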
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceSegmentedRadixSort provides device-wide, parallel operations for computing a batched radix sort across multiple, non-overlapping sequences of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "dispatch/dispatch_radix_sort.cuh" +#include "../util_arch.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceSegmentedRadixSort provides device-wide, parallel operations for computing a batched radix sort across multiple, non-overlapping sequences of data items residing within device-accessible memory. ![](segmented_sorting_logo.png) + * \ingroup SegmentedModule + * + * \par Overview + * The [radix sorting method](http://en.wikipedia.org/wiki/Radix_sort) arranges + * items into ascending (or descending) order. The algorithm relies upon a positional representation for + * keys, i.e., each key is comprised of an ordered sequence of symbols (e.g., digits, + * characters, etc.) specified from least-significant to most-significant. For a + * given input sequence of keys and a set of rules specifying a total ordering + * of the symbolic alphabet, the radix sorting method produces a lexicographic + * ordering of those keys. + * + * \par + * DeviceSegmentedRadixSort can sort all of the built-in C++ numeric primitive types + * (unsigned char, \p int, \p double, etc.) as well as CUDA's \p __half + * half-precision floating-point type. Although the direct radix sorting + * method can only be applied to unsigned integral types, DeviceSegmentedRadixSort + * is able to sort signed and floating-point types via simple bit-wise transformations + * that ensure lexicographic key ordering. 
+ * + * \par Usage Considerations + * \cdp_class{DeviceSegmentedRadixSort} + * + */ +struct DeviceSegmentedRadixSort +{ + + /******************************************************************//** + * \name Key-value pairs + *********************************************************************/ + //@{ + + /** + * \brief Sorts segments of key-value pairs into ascending order. (~2N auxiliary storage required) + * + * \par + * - The contents of the input data are not altered by the sorting operation + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys + * with associated vector of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] + * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] + * int *d_values_out; // e.g., [-, -, -, -, -, -, -] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, + * d_keys_in, d_keys_out, d_values_in, d_values_out, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, + * d_keys_in, d_keys_out, d_values_in, d_values_out, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] + * // d_values_out <-- [1, 2, 0, 5, 4, 3, 6] + * + * \endcode + * + * \tparam KeyT [inferred] Key type + * \tparam ValueT [inferred] Value type + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename KeyT, + typename ValueT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortPairs( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + const KeyT *d_keys_in, ///< [in] %Device-accessible pointer to the input data of key data to sort + KeyT *d_keys_out, ///< [out] %Device-accessible pointer to the sorted output sequence of key data + const ValueT *d_values_in, ///< [in] %Device-accessible pointer to the corresponding input sequence of associated value items + ValueT *d_values_out, ///< [out] %Device-accessible pointer to the correspondingly-reordered output sequence of associated value items + int num_items, ///< [in] The total number of items to sort (across all segments) + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + DoubleBuffer d_values(const_cast(d_values_in), d_values_out); + + return DispatchSegmentedRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + false, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts segments of key-value pairs into ascending order. (~N auxiliary storage required) + * + * \par + * - The sorting operation is given a pair of key buffers and a corresponding + * pair of associated value buffers. Each pair is managed by a DoubleBuffer + * structure that indicates which of the two buffers is "current" (and thus + * contains the input data to be sorted). + * - The contents of both buffers within each pair may be altered by the sorting + * operation. + * - Upon completion, the sorting operation will update the "current" indicator + * within each DoubleBuffer wrapper to reference which of the two buffers + * now contains the sorted output sequence (a function of the number of key bits + * specified and the targeted device architecture). + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. 
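A minimal sketch of the pointer-based SortPairs overload above, using the same three-segment example; the shared offsets array is reused as both begin and (shifted by one) end offsets, and the host setup is an illustrative assumption:

#include <cub/cub.cuh>
#include <cstdio>

int main()
{
    const int num_items = 7, num_segments = 3;
    int h_offsets[num_segments + 1] = {0, 3, 3, 7};      // the middle segment is empty
    int h_keys[num_items]           = {8, 6, 7, 5, 3, 0, 9};
    int h_values[num_items]         = {0, 1, 2, 3, 4, 5, 6};

    int *d_offsets, *d_keys_in, *d_keys_out, *d_values_in, *d_values_out;
    cudaMalloc(&d_offsets,    (num_segments + 1) * sizeof(int));
    cudaMalloc(&d_keys_in,    num_items * sizeof(int));
    cudaMalloc(&d_keys_out,   num_items * sizeof(int));
    cudaMalloc(&d_values_in,  num_items * sizeof(int));
    cudaMalloc(&d_values_out, num_items * sizeof(int));
    cudaMemcpy(d_offsets,   h_offsets, (num_segments + 1) * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_keys_in,   h_keys,    num_items * sizeof(int),          cudaMemcpyHostToDevice);
    cudaMemcpy(d_values_in, h_values,  num_items * sizeof(int),          cudaMemcpyHostToDevice);

    // Size query, allocation, then the segmented sort itself.
    void *d_temp_storage = NULL;
    size_t temp_storage_bytes = 0;
    cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
        d_keys_in, d_keys_out, d_values_in, d_values_out,
        num_items, num_segments, d_offsets, d_offsets + 1);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
        d_keys_in, d_keys_out, d_values_in, d_values_out,
        num_items, num_segments, d_offsets, d_offsets + 1);

    // Expected: d_keys_out = [6, 7, 8, 0, 3, 5, 9], d_values_out = [1, 2, 0, 5, 4, 3, 6]
    int h_keys_out[num_items];
    cudaMemcpy(h_keys_out, d_keys_out, num_items * sizeof(int), cudaMemcpyDeviceToHost);
    for (int i = 0; i < num_items; i++) printf("%d ", h_keys_out[i]);
    printf("\n");
    return 0;
}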
+ * - \devicestorageP + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys + * with associated vector of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] + * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] + * int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] + * ... + * + * // Create a set of DoubleBuffers to wrap pairs of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] + * // d_values.Current() <-- [5, 4, 3, 1, 2, 0, 6] + * + * \endcode + * + * \tparam KeyT [inferred] Key type + * \tparam ValueT [inferred] Value type + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename KeyT, + typename ValueT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortPairs( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + DoubleBuffer &d_values, ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values + int num_items, ///< [in] The total number of items to sort (across all segments) + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. 
+ int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + return DispatchSegmentedRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + true, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts segments of key-value pairs into descending order. (~2N auxiliary storage required). + * + * \par + * - The contents of the input data are not altered by the sorting operation + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys + * with associated vector of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] + * int *d_values_in; // e.g., [0, 1, 2, 3, 4, 5, 6] + * int *d_values_out; // e.g., [-, -, -, -, -, -, -] + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, + * d_keys_in, d_keys_out, d_values_in, d_values_out, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, + * d_keys_in, d_keys_out, d_values_in, d_values_out, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] + * // d_values_out <-- [0, 2, 1, 6, 3, 4, 5] + * + * \endcode + * + * \tparam KeyT [inferred] Key type + * \tparam ValueT [inferred] Value type + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename KeyT, + typename ValueT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortPairsDescending( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + const KeyT *d_keys_in, ///< [in] %Device-accessible pointer to the input data of key data to sort + KeyT *d_keys_out, ///< [out] %Device-accessible pointer to the sorted output sequence of key data + const ValueT *d_values_in, ///< [in] %Device-accessible pointer to the corresponding input sequence of associated value items + ValueT *d_values_out, ///< [out] %Device-accessible pointer to the correspondingly-reordered output sequence of associated value items + int num_items, ///< [in] The total number of items to sort (across all segments) + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ { + // Signed integer type for global offsets + typedef int OffsetT; + + DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + DoubleBuffer d_values(const_cast(d_values_in), d_values_out); + + return DispatchSegmentedRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + false, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts segments of key-value pairs into descending order. (~N auxiliary storage required). + * + * \par + * - The sorting operation is given a pair of key buffers and a corresponding + * pair of associated value buffers. Each pair is managed by a DoubleBuffer + * structure that indicates which of the two buffers is "current" (and thus + * contains the input data to be sorted). + * - The contents of both buffers within each pair may be altered by the sorting + * operation. + * - Upon completion, the sorting operation will update the "current" indicator + * within each DoubleBuffer wrapper to reference which of the two buffers + * now contains the sorted output sequence (a function of the number of key bits + * specified and the targeted device architecture). + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageP + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys + * with associated vector of \p int values. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] + * int *d_value_buf; // e.g., [0, 1, 2, 3, 4, 5, 6] + * int *d_value_alt_buf; // e.g., [-, -, -, -, -, -, -] + * ... 
+ * + * // Create a set of DoubleBuffers to wrap pairs of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * cub::DoubleBuffer d_values(d_value_buf, d_value_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage, temp_storage_bytes, d_keys, d_values, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] + * // d_values.Current() <-- [0, 2, 1, 6, 3, 4, 5] + * + * \endcode + * + * \tparam KeyT [inferred] Key type + * \tparam ValueT [inferred] Value type + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename KeyT, + typename ValueT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortPairsDescending( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + DoubleBuffer &d_values, ///< [in,out] Double-buffer of values whose "current" device-accessible buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values + int num_items, ///< [in] The total number of items to sort (across all segments) + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ { + // Signed integer type for global offsets + typedef int OffsetT; + + return DispatchSegmentedRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + true, + stream, + debug_synchronous); + } + + + //@} end member group + /******************************************************************//** + * \name Keys-only + *********************************************************************/ + //@{ + + + /** + * \brief Sorts segments of keys into ascending order. (~2N auxiliary storage required) + * + * \par + * - The contents of the input data are not altered by the sorting operation + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // d_keys_out <-- [6, 7, 8, 0, 3, 5, 9] + * + * \endcode + * + * \tparam KeyT [inferred] Key type + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename KeyT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortKeys( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + const KeyT *d_keys_in, ///< [in] %Device-accessible pointer to the input data of key data to sort + KeyT *d_keys_out, ///< [out] %Device-accessible pointer to the sorted output sequence of key data + int num_items, ///< [in] The total number of items to sort (across all segments) + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // Null value type + DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + DoubleBuffer d_values; + + return DispatchSegmentedRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + false, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts segments of keys into ascending order. (~N auxiliary storage required). + * + * \par + * - The sorting operation is given a pair of key buffers managed by a + * DoubleBuffer structure that indicates which of the two buffers is + * "current" (and thus contains the input data to be sorted). + * - The contents of both buffers may be altered by the sorting operation. + * - Upon completion, the sorting operation will update the "current" indicator + * within the DoubleBuffer wrapper to reference which of the two buffers + * now contains the sorted output sequence (a function of the number of key bits + * specified and the targeted device architecture). + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageP + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys. 
+ * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] + * ... + * + * // Create a DoubleBuffer to wrap the pair of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // d_keys.Current() <-- [6, 7, 8, 0, 3, 5, 9] + * + * \endcode + * + * \tparam KeyT [inferred] Key type + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename KeyT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortKeys( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + int num_items, ///< [in] The total number of items to sort (across all segments) + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ { + // Signed integer type for global offsets + typedef int OffsetT; + + // Null value type + DoubleBuffer d_values; + + return DispatchSegmentedRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + true, + stream, + debug_synchronous); + } + + /** + * \brief Sorts segments of keys into descending order. (~2N auxiliary storage required). + * + * \par + * - The contents of the input data are not altered by the sorting operation + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageNP For sorting using only O(P) temporary storage, see the sorting interface using DoubleBuffer wrappers below. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_keys_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_keys_out; // e.g., [-, -, -, -, -, -, -] + * ... + * + * // Create a DoubleBuffer to wrap the pair of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // d_keys_out <-- [8, 7, 6, 9, 5, 3, 0] + * + * \endcode + * + * \tparam KeyT [inferred] Key type + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename KeyT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortKeysDescending( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + const KeyT *d_keys_in, ///< [in] %Device-accessible pointer to the input data of key data to sort + KeyT *d_keys_out, ///< [out] %Device-accessible pointer to the sorted output sequence of key data + int num_items, ///< [in] The total number of items to sort (across all segments) + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + DoubleBuffer d_keys(const_cast(d_keys_in), d_keys_out); + DoubleBuffer d_values; + + return DispatchSegmentedRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + false, + stream, + debug_synchronous); + } + + + /** + * \brief Sorts segments of keys into descending order. (~N auxiliary storage required). + * + * \par + * - The sorting operation is given a pair of key buffers managed by a + * DoubleBuffer structure that indicates which of the two buffers is + * "current" (and thus contains the input data to be sorted). + * - The contents of both buffers may be altered by the sorting operation. + * - Upon completion, the sorting operation will update the "current" indicator + * within the DoubleBuffer wrapper to reference which of the two buffers + * now contains the sorted output sequence (a function of the number of key bits + * specified and the targeted device architecture). + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - An optional bit subrange [begin_bit, end_bit) of differentiating key bits can be specified. This can reduce overall sorting overhead and yield a corresponding performance improvement. + * - \devicestorageP + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the batched sorting of three segments (with one zero-length segment) of \p int keys. 
+ * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for sorting data + * int num_items; // e.g., 7 + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_key_buf; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_key_alt_buf; // e.g., [-, -, -, -, -, -, -] + * ... + * + * // Create a DoubleBuffer to wrap the pair of device pointers + * cub::DoubleBuffer d_keys(d_key_buf, d_key_alt_buf); + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sorting operation + * cub::DeviceSegmentedRadixSort::SortKeysDescending(d_temp_storage, temp_storage_bytes, d_keys, + * num_items, num_segments, d_offsets, d_offsets + 1); + * + * // d_keys.Current() <-- [8, 7, 6, 9, 5, 3, 0] + * + * \endcode + * + * \tparam KeyT [inferred] Key type + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename KeyT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t SortKeysDescending( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Reference to the double-buffer of keys whose "current" device-accessible buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + int num_items, ///< [in] The total number of items to sort (across all segments) + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int begin_bit = 0, ///< [in] [optional] The least-significant bit index (inclusive) needed for key comparison + int end_bit = sizeof(KeyT) * 8, ///< [in] [optional] The most-significant bit index (exclusive) needed for key comparison (e.g., sizeof(unsigned int) * 8) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ { + // Signed integer type for global offsets + typedef int OffsetT; + + // Null value type + DoubleBuffer d_values; + + return DispatchSegmentedRadixSort::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys, + d_values, + num_items, + num_segments, + d_begin_offsets, + d_end_offsets, + begin_bit, + end_bit, + true, + stream, + debug_synchronous); + } + + + //@} end member group + + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/GraphBLAS/CUDA/local_cub/device/device_segmented_reduce.cuh b/GraphBLAS/CUDA/local_cub/device/device_segmented_reduce.cuh new file mode 100644 index 0000000000..6c3b54a031 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/device/device_segmented_reduce.cuh @@ -0,0 +1,619 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceSegmentedReduce provides device-wide, parallel operations for computing a batched reduction across multiple sequences of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "../iterator/arg_index_input_iterator.cuh" +#include "dispatch/dispatch_reduce.cuh" +#include "dispatch/dispatch_reduce_by_key.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceSegmentedReduce provides device-wide, parallel operations for computing a reduction across multiple sequences of data items residing within device-accessible memory. ![](reduce_logo.png) + * \ingroup SegmentedModule + * + * \par Overview + * A reduction (or fold) + * uses a binary combining operator to compute a single aggregate from a sequence of input elements. 
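In the segmented variant, the combining operator is applied independently within each segment, so the call produces one aggregate per segment rather than a single value. A minimal host-side sketch (illustrative only) of a per-segment sum over the example data used in the snippets below, assuming device-resident arrays: with offsets [0, 3, 3, 7] and input [8, 6, 7, 5, 3, 0, 9], the result is [21, 0, 17], the empty middle segment reducing to the initial value 0.

    // Sketch: per-segment sum using the usual two-phase temp-storage pattern.
    #include <cuda_runtime.h>
    #include <cub/cub.cuh>

    void segmented_sum(const int *d_in, int *d_out,
                       int num_segments, const int *d_offsets)
    {
        void  *d_temp_storage     = NULL;
        size_t temp_storage_bytes = 0;

        // First call sizes the temporary storage; second call runs the reduction.
        cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes,
            d_in, d_out, num_segments, d_offsets, d_offsets + 1);
        cudaMalloc(&d_temp_storage, temp_storage_bytes);
        cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes,
            d_in, d_out, num_segments, d_offsets, d_offsets + 1);

        cudaFree(d_temp_storage);
    }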
+ * + * \par Usage Considerations + * \cdp_class{DeviceSegmentedReduce} + * + */ +struct DeviceSegmentedReduce +{ + /** + * \brief Computes a device-wide segmented reduction using the specified binary \p reduction_op functor. + * + * \par + * - Does not support binary reduction operators that are non-commutative. + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates a custom min-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // CustomMin functor + * struct CustomMin + * { + * template + * CUB_RUNTIME_FUNCTION __forceinline__ + * T operator()(const T &a, const T &b) const { + * return (b < a) ? b : a; + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [-, -, -] + * CustomMin min_op; + * int initial_value; // e.g., INT_MAX + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1, min_op, initial_value); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run reduction + * cub::DeviceSegmentedReduce::Reduce(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1, min_op, initial_value); + * + * // d_out <-- [6, INT_MAX, 0] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + * \tparam ReductionOp [inferred] Binary reduction functor type having member T operator()(const T &a, const T &b) + * \tparam T [inferred] Data element type that is convertible to the \p value type of \p InputIteratorT + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename OffsetIteratorT, + typename ReductionOp, + typename T> + CUB_RUNTIME_FUNCTION + static cudaError_t Reduce( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + ReductionOp reduction_op, ///< [in] Binary reduction functor + T initial_value, ///< [in] Initial value of the reduction for each segment + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + return DispatchSegmentedReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_segments, + d_begin_offsets, + d_end_offsets, + reduction_op, + initial_value, + stream, + debug_synchronous); + } + + + /** + * \brief Computes a device-wide segmented sum using the addition ('+') operator. + * + * \par + * - Uses \p 0 as the initial value of the reduction for each segment. + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - Does not support \p + operators that are non-commutative.. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the sum reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [-, -, -] + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run sum-reduction + * cub::DeviceSegmentedReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1); + * + * // d_out <-- [21, 0, 17] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t Sum( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type + + return DispatchSegmentedReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_segments, + d_begin_offsets, + d_end_offsets, + cub::Sum(), + OutputT(), // zero-initialize + stream, + debug_synchronous); + } + + + /** + * \brief Computes a device-wide segmented minimum using the less-than ('<') operator. + * + * \par + * - Uses std::numeric_limits::max() as the initial value of the reduction for each segment. + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). 
+ * - Does not support \p < operators that are non-commutative. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the min-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [-, -, -] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run min-reduction + * cub::DeviceSegmentedReduce::Min(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1); + * + * // d_out <-- [6, INT_MAX, 0] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t Min( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // The input value type + typedef typename std::iterator_traits::value_type InputT; + + return DispatchSegmentedReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_segments, + d_begin_offsets, + d_end_offsets, + cub::Min(), + Traits::Max(), // replace with std::numeric_limits::max() when C++11 support is more prevalent + stream, + debug_synchronous); + } + + + /** + * \brief Finds the first device-wide minimum in each segment using the less-than ('<') operator, also returning the in-segment index of that item. 
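For the Arg* reductions, the per-segment aggregate is a cub::KeyValuePair whose key member is the item's offset within its segment and whose value member is the extremum itself. A minimal host-side sketch for int input (illustrative only, assuming device-resident arrays):

    // Sketch: per-segment argmin over int data. For int inputs the output element
    // type is cub::KeyValuePair<int, int>: .key is the item's offset within its
    // segment, .value is the segment minimum.
    #include <cuda_runtime.h>
    #include <cub/cub.cuh>

    void segmented_argmin(const int *d_in, cub::KeyValuePair<int, int> *d_out,
                          int num_segments, const int *d_offsets)
    {
        void  *d_temp_storage     = NULL;
        size_t temp_storage_bytes = 0;

        cub::DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes,
            d_in, d_out, num_segments, d_offsets, d_offsets + 1);
        cudaMalloc(&d_temp_storage, temp_storage_bytes);
        cub::DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes,
            d_in, d_out, num_segments, d_offsets, d_offsets + 1);

        // Copy the first segment's result back to the host and unpack it.
        cub::KeyValuePair<int, int> h_result;
        cudaMemcpy(&h_result, d_out, sizeof(h_result), cudaMemcpyDeviceToHost);
        int argmin_offset = h_result.key;    // position within segment 0
        int min_value     = h_result.value;  // minimum of segment 0
        (void) argmin_offset; (void) min_value;

        cudaFree(d_temp_storage);
    }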
+ * + * \par + * - The output value type of \p d_out is cub::KeyValuePair (assuming the value type of \p d_in is \p T) + * - The minimum of the ith segment is written to d_out[i].value and its offset in that segment is written to d_out[i].key. + * - The {1, std::numeric_limits::max()} tuple is produced for zero-length inputs + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - Does not support \p < operators that are non-commutative. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the argmin-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * KeyValuePair *d_out; // e.g., [{-,-}, {-,-}, {-,-}] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run argmin-reduction + * cub::DeviceSegmentedReduce::ArgMin(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1); + * + * // d_out <-- [{1,6}, {1,INT_MAX}, {2,0}] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items (of some type \p T) \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate (having value type KeyValuePair) \iterator + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t ArgMin( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. 
Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // The input type + typedef typename std::iterator_traits::value_type InputValueT; + + // The output tuple type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + KeyValuePair, // ... then the key value pair OffsetT + InputValueT + typename std::iterator_traits::value_type>::Type OutputTupleT; // ... else the output iterator's value type + + // The output value type + typedef typename OutputTupleT::Value OutputValueT; + + // Wrapped input iterator to produce index-value tuples + typedef ArgIndexInputIterator ArgIndexInputIteratorT; + ArgIndexInputIteratorT d_indexed_in(d_in); + + // Initial value + OutputTupleT initial_value(1, Traits::Max()); // replace with std::numeric_limits::max() when C++11 support is more prevalent + + return DispatchSegmentedReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_indexed_in, + d_out, + num_segments, + d_begin_offsets, + d_end_offsets, + cub::ArgMin(), + initial_value, + stream, + debug_synchronous); + } + + + /** + * \brief Computes a device-wide segmented maximum using the greater-than ('>') operator. + * + * \par + * - Uses std::numeric_limits::lowest() as the initial value of the reduction. + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - Does not support \p > operators that are non-commutative. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the max-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * int *d_out; // e.g., [-, -, -] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run max-reduction + * cub::DeviceSegmentedReduce::Max(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1); + * + * // d_out <-- [8, INT_MIN, 9] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate \iterator + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t Max( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // The input value type + typedef typename std::iterator_traits::value_type InputT; + + return DispatchSegmentedReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_segments, + d_begin_offsets, + d_end_offsets, + cub::Max(), + Traits::Lowest(), // replace with std::numeric_limits::lowest() when C++11 support is more prevalent + stream, + debug_synchronous); + } + + + /** + * \brief Finds the first device-wide maximum in each segment using the greater-than ('>') operator, also returning the in-segment index of that item + * + * \par + * - The output value type of \p d_out is cub::KeyValuePair (assuming the value type of \p d_in is \p T) + * - The maximum of the ith segment is written to d_out[i].value and its offset in that segment is written to d_out[i].key. + * - The {1, std::numeric_limits::lowest()} tuple is produced for zero-length inputs + * - When input a contiguous sequence of segments, a single sequence + * \p segment_offsets (of length num_segments+1) can be aliased + * for both the \p d_begin_offsets and \p d_end_offsets parameters (where + * the latter is specified as segment_offsets+1). + * - Does not support \p > operators that are non-commutative. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the argmax-reduction of a device vector of \p int data elements. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_segments; // e.g., 3 + * int *d_offsets; // e.g., [0, 3, 3, 7] + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * KeyValuePair *d_out; // e.g., [{-,-}, {-,-}, {-,-}] + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run argmax-reduction + * cub::DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes, d_in, d_out, + * num_segments, d_offsets, d_offsets + 1); + * + * // d_out <-- [{0,8}, {1,INT_MIN}, {3,9}] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items (of some type \p T) \iterator + * \tparam OutputIteratorT [inferred] Output iterator type for recording the reduced aggregate (having value type KeyValuePair) \iterator + * \tparam OffsetIteratorT [inferred] Random-access input iterator type for reading segment offsets \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename OffsetIteratorT> + CUB_RUNTIME_FUNCTION + static cudaError_t ArgMax( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + // Signed integer type for global offsets + typedef int OffsetT; + + // The input type + typedef typename std::iterator_traits::value_type InputValueT; + + // The output tuple type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + KeyValuePair, // ... then the key value pair OffsetT + InputValueT + typename std::iterator_traits::value_type>::Type OutputTupleT; // ... 
else the output iterator's value type + + // The output value type + typedef typename OutputTupleT::Value OutputValueT; + + // Wrapped input iterator to produce index-value tuples + typedef ArgIndexInputIterator ArgIndexInputIteratorT; + ArgIndexInputIteratorT d_indexed_in(d_in); + + // Initial value + OutputTupleT initial_value(1, Traits::Lowest()); // replace with std::numeric_limits::lowest() when C++11 support is more prevalent + + return DispatchSegmentedReduce::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_indexed_in, + d_out, + num_segments, + d_begin_offsets, + d_end_offsets, + cub::ArgMax(), + initial_value, + stream, + debug_synchronous); + } + +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/GraphBLAS/CUDA/local_cub/device/device_select.cuh b/GraphBLAS/CUDA/local_cub/device/device_select.cuh new file mode 100644 index 0000000000..52a3e126da --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/device/device_select.cuh @@ -0,0 +1,369 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceSelect provides device-wide, parallel operations for compacting selected items from sequences of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "dispatch/dispatch_select_if.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceSelect provides device-wide, parallel operations for compacting selected items from sequences of data items residing within device-accessible memory. 
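For reference, a minimal self-contained sketch of calling cub::DeviceSegmentedReduce::ArgMax as documented above, using the same example data as the header comment. It assumes the headers added in this diff are reachable as <cub/cub.cuh>; the include path, main() harness, and printf check are illustrative assumptions, and error checking is omitted.

    #include <cub/cub.cuh>
    #include <cstdio>

    int main()
    {
        const int num_items    = 7;
        const int num_segments = 3;
        int h_in[num_items]             = {8, 6, 7, 5, 3, 0, 9};
        int h_offsets[num_segments + 1] = {0, 3, 3, 7};

        int *d_in, *d_offsets;
        cub::KeyValuePair<int, int> *d_out;      // key = offset within segment, value = segment max
        cudaMalloc(&d_in, sizeof(h_in));
        cudaMalloc(&d_offsets, sizeof(h_offsets));
        cudaMalloc(&d_out, num_segments * sizeof(*d_out));
        cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);
        cudaMemcpy(d_offsets, h_offsets, sizeof(h_offsets), cudaMemcpyHostToDevice);

        // First call (NULL temp storage) sizes the allocation; second call does the work.
        void *d_temp_storage = NULL;
        size_t temp_storage_bytes = 0;
        cub::DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes,
            d_in, d_out, num_segments, d_offsets, d_offsets + 1);
        cudaMalloc(&d_temp_storage, temp_storage_bytes);
        cub::DeviceSegmentedReduce::ArgMax(d_temp_storage, temp_storage_bytes,
            d_in, d_out, num_segments, d_offsets, d_offsets + 1);

        cub::KeyValuePair<int, int> h_out[num_segments];
        cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
        for (int i = 0; i < num_segments; i++)
            printf("segment %d: max %d at offset %d\n", i, h_out[i].value, h_out[i].key);

        cudaFree(d_in); cudaFree(d_offsets); cudaFree(d_out); cudaFree(d_temp_storage);
        return 0;
    }

The same size-then-run pattern (first call with a NULL d_temp_storage, then the real call) applies to every Device* entry point in these headers.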
![](select_logo.png) + * \ingroup SingleModule + * + * \par Overview + * These operations apply a selection criterion to selectively copy + * items from a specified input sequence to a compact output sequence. + * + * \par Usage Considerations + * \cdp_class{DeviceSelect} + * + * \par Performance + * \linear_performance{select-flagged, select-if, and select-unique} + * + * \par + * The following chart illustrates DeviceSelect::If + * performance across different CUDA architectures for \p int32 items, + * where 50% of the items are randomly selected. + * + * \image html select_if_int32_50_percent.png + * + * \par + * The following chart illustrates DeviceSelect::Unique + * performance across different CUDA architectures for \p int32 items + * where segments have lengths uniformly sampled from [1,1000]. + * + * \image html select_unique_int32_len_500.png + * + * \par + * \plots_below + * + */ +struct DeviceSelect +{ + /** + * \brief Uses the \p d_flags sequence to selectively copy the corresponding items from \p d_in into \p d_out. The total number of items selected is written to \p d_num_selected_out. ![](select_flags_logo.png) + * + * \par + * - The value type of \p d_flags must be castable to \p bool (e.g., \p bool, \p char, \p int, etc.). + * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering. + * - \devicestorage + * + * \par Snippet + * The code snippet below illustrates the compaction of items selected from an \p int device vector. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input, flags, and output + * int num_items; // e.g., 8 + * int *d_in; // e.g., [1, 2, 3, 4, 5, 6, 7, 8] + * char *d_flags; // e.g., [1, 0, 0, 1, 0, 1, 1, 0] + * int *d_out; // e.g., [ , , , , , , , ] + * int *d_num_selected_out; // e.g., [ ] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run selection + * cub::DeviceSelect::Flagged(d_temp_storage, temp_storage_bytes, d_in, d_flags, d_out, d_num_selected_out, num_items); + * + * // d_out <-- [1, 4, 6, 7] + * // d_num_selected_out <-- [4] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam FlagIterator [inferred] Random-access input iterator type for reading selection flags \iterator + * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing selected items \iterator + * \tparam NumSelectedIteratorT [inferred] Output iterator type for recording the number of items selected \iterator + */ + template < + typename InputIteratorT, + typename FlagIterator, + typename OutputIteratorT, + typename NumSelectedIteratorT> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Flagged( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + FlagIterator d_flags, ///< [in] Pointer to the input sequence of selection flags + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of selected data items + NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out) + int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + typedef int OffsetT; // Signed integer type for global offsets + typedef NullType SelectOp; // Selection op (not used) + typedef NullType EqualityOp; // Equality operator (not used) + + return DispatchSelectIf::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_flags, + d_out, + d_num_selected_out, + SelectOp(), + EqualityOp(), + num_items, + stream, + debug_synchronous); + } + + + /** + * \brief Uses the \p select_op functor to selectively copy items from \p d_in into \p d_out. The total number of items selected is written to \p d_num_selected_out. ![](select_logo.png) + * + * \par + * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering. + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated select-if performance across different + * CUDA architectures for \p int32 and \p int64 items, respectively. Items are + * selected with 50% probability. + * + * \image html select_if_int32_50_percent.png + * \image html select_if_int64_50_percent.png + * + * \par + * The following charts are similar, but 5% selection probability: + * + * \image html select_if_int32_5_percent.png + * \image html select_if_int64_5_percent.png + * + * \par Snippet + * The code snippet below illustrates the compaction of items selected from an \p int device vector. + * \par + * \code + * #include // or equivalently + * + * // Functor type for selecting values less than some criteria + * struct LessThan + * { + * int compare; + * + * CUB_RUNTIME_FUNCTION __forceinline__ + * LessThan(int compare) : compare(compare) {} + * + * CUB_RUNTIME_FUNCTION __forceinline__ + * bool operator()(const int &a) const { + * return (a < compare); + * } + * }; + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 8 + * int *d_in; // e.g., [0, 2, 3, 9, 5, 2, 81, 8] + * int *d_out; // e.g., [ , , , , , , , ] + * int *d_num_selected_out; // e.g., [ ] + * LessThan select_op(7); + * ... 
+ * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run selection + * cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items, select_op); + * + * // d_out <-- [0, 2, 3, 5, 2] + * // d_num_selected_out <-- [5] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing selected items \iterator + * \tparam NumSelectedIteratorT [inferred] Output iterator type for recording the number of items selected \iterator + * \tparam SelectOp [inferred] Selection operator type having member bool operator()(const T &a) + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename NumSelectedIteratorT, + typename SelectOp> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t If( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of selected data items + NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out) + int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + SelectOp select_op, ///< [in] Unary selection operator + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + typedef int OffsetT; // Signed integer type for global offsets + typedef NullType* FlagIterator; // FlagT iterator type (not used) + typedef NullType EqualityOp; // Equality operator (not used) + + return DispatchSelectIf::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + NULL, + d_out, + d_num_selected_out, + select_op, + EqualityOp(), + num_items, + stream, + debug_synchronous); + } + + + /** + * \brief Given an input sequence \p d_in having runs of consecutive equal-valued keys, only the first key from each run is selectively copied to \p d_out. The total number of items selected is written to \p d_num_selected_out. ![](unique_logo.png) + * + * \par + * - The == equality operator is used to determine whether keys are equivalent + * - Copies of the selected items are compacted into \p d_out and maintain their original relative ordering. + * - \devicestorage + * + * \par Performance + * The following charts illustrate saturated select-unique performance across different + * CUDA architectures for \p int32 and \p int64 items, respectively. Segments have + * lengths uniformly sampled from [1,1000]. 
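A self-contained sketch of the DeviceSelect::If entry point documented above, using the LessThan functor from that snippet. The <cub/cub.cuh> include path and the __host__ __device__ qualifiers on the functor are illustrative assumptions; error checking is omitted.

    #include <cub/cub.cuh>
    #include <cstdio>

    // Selection functor: keep values strictly less than a threshold
    struct LessThan
    {
        int compare;
        __host__ __device__ __forceinline__ LessThan(int compare) : compare(compare) {}
        __host__ __device__ __forceinline__ bool operator()(const int &a) const
        {
            return (a < compare);
        }
    };

    int main()
    {
        const int num_items = 8;
        int h_in[num_items] = {0, 2, 3, 9, 5, 2, 81, 8};

        int *d_in, *d_out, *d_num_selected_out;
        cudaMalloc(&d_in, sizeof(h_in));
        cudaMalloc(&d_out, sizeof(h_in));
        cudaMalloc(&d_num_selected_out, sizeof(int));
        cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);

        LessThan select_op(7);

        // Size the temporary storage, allocate it, then run the selection.
        void *d_temp_storage = NULL;
        size_t temp_storage_bytes = 0;
        cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes,
            d_in, d_out, d_num_selected_out, num_items, select_op);
        cudaMalloc(&d_temp_storage, temp_storage_bytes);
        cub::DeviceSelect::If(d_temp_storage, temp_storage_bytes,
            d_in, d_out, d_num_selected_out, num_items, select_op);

        int h_num_selected = 0;
        cudaMemcpy(&h_num_selected, d_num_selected_out, sizeof(int), cudaMemcpyDeviceToHost);
        int h_out[num_items];
        cudaMemcpy(h_out, d_out, h_num_selected * sizeof(int), cudaMemcpyDeviceToHost);
        printf("selected %d items:", h_num_selected);          // expected: 5 items (0 2 3 5 2)
        for (int i = 0; i < h_num_selected; i++) printf(" %d", h_out[i]);
        printf("\n");

        cudaFree(d_in); cudaFree(d_out); cudaFree(d_num_selected_out); cudaFree(d_temp_storage);
        return 0;
    }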
+ * + * \image html select_unique_int32_len_500.png + * \image html select_unique_int64_len_500.png + * + * \par + * The following charts are similar, but with segment lengths uniformly sampled from [1,10]: + * + * \image html select_unique_int32_len_5.png + * \image html select_unique_int64_len_5.png + * + * \par Snippet + * The code snippet below illustrates the compaction of items selected from an \p int device vector. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input and output + * int num_items; // e.g., 8 + * int *d_in; // e.g., [0, 2, 2, 9, 5, 5, 5, 8] + * int *d_out; // e.g., [ , , , , , , , ] + * int *d_num_selected_out; // e.g., [ ] + * ... + * + * // Determine temporary device storage requirements + * void *d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run selection + * cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes, d_in, d_out, d_num_selected_out, num_items); + * + * // d_out <-- [0, 2, 9, 5, 8] + * // d_num_selected_out <-- [5] + * + * \endcode + * + * \tparam InputIteratorT [inferred] Random-access input iterator type for reading input items \iterator + * \tparam OutputIteratorT [inferred] Random-access output iterator type for writing selected items \iterator + * \tparam NumSelectedIteratorT [inferred] Output iterator type for recording the number of items selected \iterator + */ + template < + typename InputIteratorT, + typename OutputIteratorT, + typename NumSelectedIteratorT> + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Unique( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of selected data items + NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the output total number of items selected (i.e., length of \p d_out) + int num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
+ { + typedef int OffsetT; // Signed integer type for global offsets + typedef NullType* FlagIterator; // FlagT iterator type (not used) + typedef NullType SelectOp; // Selection op (not used) + typedef Equality EqualityOp; // Default == operator + + return DispatchSelectIf::Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + NULL, + d_out, + d_num_selected_out, + SelectOp(), + EqualityOp(), + num_items, + stream, + debug_synchronous); + } + +}; + +/** + * \example example_device_select_flagged.cu + * \example example_device_select_if.cu + * \example example_device_select_unique.cu + */ + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/GraphBLAS/CUDA/local_cub/device/device_spmv.cuh b/GraphBLAS/CUDA/local_cub/device/device_spmv.cuh new file mode 100644 index 0000000000..63b6a7e86f --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/device/device_spmv.cuh @@ -0,0 +1,174 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector multiplication (SpMV). + */ + +#pragma once + +#include +#include +#include + +#include "dispatch/dispatch_spmv_orig.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * dense-vector multiplication (SpMV). 
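Before the SpMV header, a compact sketch of the DeviceSelect::Unique entry point defined above. It follows the same two-phase calling pattern; the <cub/cub.cuh> include path is an illustrative assumption and error checking is omitted.

    #include <cub/cub.cuh>
    #include <cstdio>

    int main()
    {
        const int num_items = 8;
        int h_in[num_items] = {0, 2, 2, 9, 5, 5, 5, 8};

        int *d_in, *d_out, *d_num_selected_out;
        cudaMalloc(&d_in, sizeof(h_in));
        cudaMalloc(&d_out, sizeof(h_in));
        cudaMalloc(&d_num_selected_out, sizeof(int));
        cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);

        // Two-phase call: NULL temp storage sizes the allocation, the second call runs it.
        void *d_temp_storage = NULL;
        size_t temp_storage_bytes = 0;
        cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes,
            d_in, d_out, d_num_selected_out, num_items);
        cudaMalloc(&d_temp_storage, temp_storage_bytes);
        cub::DeviceSelect::Unique(d_temp_storage, temp_storage_bytes,
            d_in, d_out, d_num_selected_out, num_items);

        int h_num = 0;
        cudaMemcpy(&h_num, d_num_selected_out, sizeof(int), cudaMemcpyDeviceToHost);
        printf("%d run heads kept\n", h_num);   // expected: 5 (the runs 0, 2, 9, 5, 8)

        cudaFree(d_in); cudaFree(d_out); cudaFree(d_num_selected_out); cudaFree(d_temp_storage);
        return 0;
    }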
+ * \ingroup SingleModule + * + * \par Overview + * The [SpMV computation](http://en.wikipedia.org/wiki/Sparse_matrix-vector_multiplication) + * performs the matrix-vector operation + * y = alpha*A*x + beta*y, + * where: + * - A is an mxn sparse matrix whose non-zero structure is specified in + * [compressed-storage-row (CSR) format](http://en.wikipedia.org/wiki/Sparse_matrix#Compressed_row_Storage_.28CRS_or_CSR.29) + * (i.e., three arrays: values, row_offsets, and column_indices) + * - x and y are dense vectors + * - alpha and beta are scalar multiplicands + * + * \par Usage Considerations + * \cdp_class{DeviceSpmv} + * + */ +struct DeviceSpmv +{ + /******************************************************************//** + * \name CSR matrix operations + *********************************************************************/ + //@{ + + /** + * \brief This function performs the matrix-vector operation y = A*x. + * + * \par Snippet + * The code snippet below illustrates SpMV upon a 9x9 CSR matrix A + * representing a 3x3 lattice (24 non-zeros). + * + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize device-accessible pointers for input matrix A, input vector x, + * // and output vector y + * int num_rows = 9; + * int num_cols = 9; + * int num_nonzeros = 24; + * + * float* d_values; // e.g., [1, 1, 1, 1, 1, 1, 1, 1, + * // 1, 1, 1, 1, 1, 1, 1, 1, + * // 1, 1, 1, 1, 1, 1, 1, 1] + * + * int* d_column_indices; // e.g., [1, 3, 0, 2, 4, 1, 5, 0, + * // 4, 6, 1, 3, 5, 7, 2, 4, + * // 8, 3, 7, 4, 6, 8, 5, 7] + * + * int* d_row_offsets; // e.g., [0, 2, 5, 7, 10, 14, 17, 19, 22, 24] + * + * float* d_vector_x; // e.g., [1, 1, 1, 1, 1, 1, 1, 1, 1] + * float* d_vector_y; // e.g., [ , , , , , , , , ] + * ... + * + * // Determine temporary device storage requirements + * void* d_temp_storage = NULL; + * size_t temp_storage_bytes = 0; + * cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values, + * d_row_offsets, d_column_indices, d_vector_x, d_vector_y, + * num_rows, num_cols, num_nonzeros, alpha, beta); + * + * // Allocate temporary storage + * cudaMalloc(&d_temp_storage, temp_storage_bytes); + * + * // Run SpMV + * cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values, + * d_row_offsets, d_column_indices, d_vector_x, d_vector_y, + * num_rows, num_cols, num_nonzeros, alpha, beta); + * + * // d_vector_y <-- [2, 3, 2, 3, 4, 3, 2, 3, 2] + * + * \endcode + * + * \tparam ValueT [inferred] Matrix and vector value type (e.g., /p float, /p double, etc.) + */ + template < + typename ValueT> + CUB_RUNTIME_FUNCTION + static cudaError_t CsrMV( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + ValueT* d_values, ///< [in] Pointer to the array of \p num_nonzeros values of the corresponding nonzero elements of matrix A. + int* d_row_offsets, ///< [in] Pointer to the array of \p m + 1 offsets demarcating the start of every row in \p d_column_indices and \p d_values (with the final entry being equal to \p num_nonzeros) + int* d_column_indices, ///< [in] Pointer to the array of \p num_nonzeros column-indices of the corresponding nonzero elements of matrix A. (Indices are zero-valued.) 
+ ValueT* d_vector_x, ///< [in] Pointer to the array of \p num_cols values corresponding to the dense input vector x + ValueT* d_vector_y, ///< [out] Pointer to the array of \p num_rows values corresponding to the dense output vector y + int num_rows, ///< [in] number of rows of matrix A. + int num_cols, ///< [in] number of columns of matrix A. + int num_nonzeros, ///< [in] number of nonzero elements of matrix A. + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + SpmvParams spmv_params; + spmv_params.d_values = d_values; + spmv_params.d_row_end_offsets = d_row_offsets + 1; + spmv_params.d_column_indices = d_column_indices; + spmv_params.d_vector_x = d_vector_x; + spmv_params.d_vector_y = d_vector_y; + spmv_params.num_rows = num_rows; + spmv_params.num_cols = num_cols; + spmv_params.num_nonzeros = num_nonzeros; + spmv_params.alpha = 1.0; + spmv_params.beta = 0.0; + + return DispatchSpmv::Dispatch( + d_temp_storage, + temp_storage_bytes, + spmv_params, + stream, + debug_synchronous); + } + + //@} end member group +}; + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/GraphBLAS/CUDA/local_cub/device/dispatch/dispatch_histogram.cuh b/GraphBLAS/CUDA/local_cub/device/dispatch/dispatch_histogram.cuh new file mode 100644 index 0000000000..ab08e8ed05 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/device/dispatch/dispatch_histogram.cuh @@ -0,0 +1,1096 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
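A self-contained sketch of the DeviceSpmv::CsrMV entry point defined above, using the 3x3-lattice CSR matrix from its header comment. Note that this entry point takes no alpha/beta arguments: the wrapper fixes alpha = 1 and beta = 0 before dispatch, so the call computes y = A*x. The <cub/cub.cuh> include path and the printed check are illustrative assumptions; error checking is omitted.

    #include <cub/cub.cuh>
    #include <cstdio>

    int main()
    {
        // 9x9 CSR matrix of a 3x3 lattice, 24 nonzeros, all values 1;
        // x is the all-ones vector, so each y[i] counts the neighbors of node i.
        const int num_rows = 9, num_cols = 9, num_nonzeros = 24;
        float h_values[num_nonzeros];
        for (int i = 0; i < num_nonzeros; i++) h_values[i] = 1.0f;
        int h_column_indices[num_nonzeros] = {1, 3, 0, 2, 4, 1, 5, 0,
                                              4, 6, 1, 3, 5, 7, 2, 4,
                                              8, 3, 7, 4, 6, 8, 5, 7};
        int h_row_offsets[num_rows + 1] = {0, 2, 5, 7, 10, 14, 17, 19, 22, 24};
        float h_x[num_cols];
        for (int i = 0; i < num_cols; i++) h_x[i] = 1.0f;

        float *d_values, *d_x, *d_y;
        int *d_column_indices, *d_row_offsets;
        cudaMalloc(&d_values, sizeof(h_values));
        cudaMalloc(&d_column_indices, sizeof(h_column_indices));
        cudaMalloc(&d_row_offsets, sizeof(h_row_offsets));
        cudaMalloc(&d_x, sizeof(h_x));
        cudaMalloc(&d_y, num_rows * sizeof(float));
        cudaMemcpy(d_values, h_values, sizeof(h_values), cudaMemcpyHostToDevice);
        cudaMemcpy(d_column_indices, h_column_indices, sizeof(h_column_indices), cudaMemcpyHostToDevice);
        cudaMemcpy(d_row_offsets, h_row_offsets, sizeof(h_row_offsets), cudaMemcpyHostToDevice);
        cudaMemcpy(d_x, h_x, sizeof(h_x), cudaMemcpyHostToDevice);

        // Size the temporary storage, allocate it, then run the SpMV.
        void *d_temp_storage = NULL;
        size_t temp_storage_bytes = 0;
        cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values,
            d_row_offsets, d_column_indices, d_x, d_y,
            num_rows, num_cols, num_nonzeros);
        cudaMalloc(&d_temp_storage, temp_storage_bytes);
        cub::DeviceSpmv::CsrMV(d_temp_storage, temp_storage_bytes, d_values,
            d_row_offsets, d_column_indices, d_x, d_y,
            num_rows, num_cols, num_nonzeros);

        float h_y[num_rows];
        cudaMemcpy(h_y, d_y, sizeof(h_y), cudaMemcpyDeviceToHost);
        for (int i = 0; i < num_rows; i++) printf("y[%d] = %g\n", i, h_y[i]);  // 2 3 2 3 4 3 2 3 2

        cudaFree(d_values); cudaFree(d_column_indices); cudaFree(d_row_offsets);
        cudaFree(d_x); cudaFree(d_y); cudaFree(d_temp_storage);
        return 0;
    }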
+ * + ******************************************************************************/ + +/** + * \file + * cub::DeviceHistogram provides device-wide parallel operations for constructing histogram(s) from a sequence of samples data residing within device-accessible memory. + */ + +#pragma once + +#include +#include +#include + +#include "../../agent/agent_histogram.cuh" +#include "../../util_debug.cuh" +#include "../../util_device.cuh" +#include "../../thread/thread_search.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + + +/****************************************************************************** + * Histogram kernel entry points + *****************************************************************************/ + +/** + * Histogram initialization kernel entry point + */ +template < + int NUM_ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename CounterT, ///< Integer type for counting sample occurrences per histogram bin + typename OffsetT> ///< Signed integer type for global offsets +__global__ void DeviceHistogramInitKernel( + ArrayWrapper num_output_bins_wrapper, ///< Number of output histogram bins per channel + ArrayWrapper d_output_histograms_wrapper, ///< Histogram counter data having logical dimensions CounterT[NUM_ACTIVE_CHANNELS][num_bins.array[CHANNEL]] + GridQueue tile_queue) ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks +{ + if ((threadIdx.x == 0) && (blockIdx.x == 0)) + tile_queue.ResetDrain(); + + int output_bin = (blockIdx.x * blockDim.x) + threadIdx.x; + + #pragma unroll + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + { + if (output_bin < num_output_bins_wrapper.array[CHANNEL]) + d_output_histograms_wrapper.array[CHANNEL][output_bin] = 0; + } +} + + +/** + * Histogram privatized sweep kernel entry point (multi-block). Computes privatized histograms, one per thread block. + */ +template < + typename AgentHistogramPolicyT, ///< Parameterized AgentHistogramPolicy tuning policy type + int PRIVATIZED_SMEM_BINS, ///< Maximum number of histogram bins per channel (e.g., up to 256) + int NUM_CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + int NUM_ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename SampleIteratorT, ///< The input iterator type. \iterator. 
+ typename CounterT, ///< Integer type for counting sample occurrences per histogram bin + typename PrivatizedDecodeOpT, ///< The transform operator type for determining privatized counter indices from samples, one for each channel + typename OutputDecodeOpT, ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel + typename OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int(AgentHistogramPolicyT::BLOCK_THREADS)) +__global__ void DeviceHistogramSweepKernel( + SampleIteratorT d_samples, ///< Input data to reduce + ArrayWrapper num_output_bins_wrapper, ///< The number bins per final output histogram + ArrayWrapper num_privatized_bins_wrapper, ///< The number bins per privatized histogram + ArrayWrapper d_output_histograms_wrapper, ///< Reference to final output histograms + ArrayWrapper d_privatized_histograms_wrapper, ///< Reference to privatized histograms + ArrayWrapper output_decode_op_wrapper, ///< The transform operator for determining output bin-ids from privatized counter indices, one for each channel + ArrayWrapper privatized_decode_op_wrapper, ///< The transform operator for determining privatized counter indices from samples, one for each channel + OffsetT num_row_pixels, ///< The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< The number of rows in the region of interest + OffsetT row_stride_samples, ///< The number of samples between starts of consecutive rows in the region of interest + int tiles_per_row, ///< Number of image tiles per row + GridQueue tile_queue) ///< Drain queue descriptor for dynamically mapping tile data onto thread blocks +{ + // Thread block type for compositing input tiles + typedef AgentHistogram< + AgentHistogramPolicyT, + PRIVATIZED_SMEM_BINS, + NUM_CHANNELS, + NUM_ACTIVE_CHANNELS, + SampleIteratorT, + CounterT, + PrivatizedDecodeOpT, + OutputDecodeOpT, + OffsetT> + AgentHistogramT; + + // Shared memory for AgentHistogram + __shared__ typename AgentHistogramT::TempStorage temp_storage; + + AgentHistogramT agent( + temp_storage, + d_samples, + num_output_bins_wrapper.array, + num_privatized_bins_wrapper.array, + d_output_histograms_wrapper.array, + d_privatized_histograms_wrapper.array, + output_decode_op_wrapper.array, + privatized_decode_op_wrapper.array); + + // Initialize counters + agent.InitBinCounters(); + + // Consume input tiles + agent.ConsumeTiles( + num_row_pixels, + num_rows, + row_stride_samples, + tiles_per_row, + tile_queue); + + // Store output to global (if necessary) + agent.StoreOutput(); + +} + + + + + + +/****************************************************************************** + * Dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for DeviceHistogram + */ +template < + int NUM_CHANNELS, ///< Number of channels interleaved in the input data (may be greater than the number of channels being actively histogrammed) + int NUM_ACTIVE_CHANNELS, ///< Number of channels actively being histogrammed + typename SampleIteratorT, ///< Random-access input iterator type for reading input items \iterator + typename CounterT, ///< Integer type for counting sample occurrences per histogram bin + typename LevelT, ///< Type for specifying bin level boundaries + typename OffsetT> ///< Signed integer type for global offsets +struct DipatchHistogram +{ + 
//--------------------------------------------------------------------- + // Types and constants + //--------------------------------------------------------------------- + + /// The sample value type of the input iterator + typedef typename std::iterator_traits::value_type SampleT; + + enum + { + // Maximum number of bins per channel for which we will use a privatized smem strategy + MAX_PRIVATIZED_SMEM_BINS = 256 + }; + + + //--------------------------------------------------------------------- + // Transform functors for converting samples to bin-ids + //--------------------------------------------------------------------- + + // Searches for bin given a list of bin-boundary levels + template + struct SearchTransform + { + LevelIteratorT d_levels; // Pointer to levels array + int num_output_levels; // Number of levels in array + + // Initializer + __host__ __device__ __forceinline__ void Init( + LevelIteratorT d_levels, // Pointer to levels array + int num_output_levels) // Number of levels in array + { + this->d_levels = d_levels; + this->num_output_levels = num_output_levels; + } + + // Method for converting samples to bin-ids + template + __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid) + { + /// Level iterator wrapper type + typedef typename If::VALUE, + CacheModifiedInputIterator, // Wrap the native input pointer with CacheModifiedInputIterator + LevelIteratorT>::Type // Directly use the supplied input iterator type + WrappedLevelIteratorT; + + WrappedLevelIteratorT wrapped_levels(d_levels); + + int num_bins = num_output_levels - 1; + if (valid) + { + bin = UpperBound(wrapped_levels, num_output_levels, (LevelT) sample) - 1; + if (bin >= num_bins) + bin = -1; + } + } + }; + + + // Scales samples to evenly-spaced bins + struct ScaleTransform + { + int num_bins; // Number of levels in array + LevelT max; // Max sample level (exclusive) + LevelT min; // Min sample level (inclusive) + LevelT scale; // Bin scaling factor + + // Initializer + template + __host__ __device__ __forceinline__ void Init( + int num_output_levels, // Number of levels in array + _LevelT max, // Max sample level (exclusive) + _LevelT min, // Min sample level (inclusive) + _LevelT scale) // Bin scaling factor + { + this->num_bins = num_output_levels - 1; + this->max = max; + this->min = min; + this->scale = scale; + } + + // Initializer (float specialization) + __host__ __device__ __forceinline__ void Init( + int num_output_levels, // Number of levels in array + float max, // Max sample level (exclusive) + float min, // Min sample level (inclusive) + float scale) // Bin scaling factor + { + this->num_bins = num_output_levels - 1; + this->max = max; + this->min = min; + this->scale = float(1.0) / scale; + } + + // Initializer (double specialization) + __host__ __device__ __forceinline__ void Init( + int num_output_levels, // Number of levels in array + double max, // Max sample level (exclusive) + double min, // Min sample level (inclusive) + double scale) // Bin scaling factor + { + this->num_bins = num_output_levels - 1; + this->max = max; + this->min = min; + this->scale = double(1.0) / scale; + } + + // Method for converting samples to bin-ids + template + __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid) + { + LevelT level_sample = (LevelT) sample; + + if (valid && (level_sample >= min) && (level_sample < max)) + bin = (int) ((level_sample - min) / scale); + } + + // Method for converting samples to bin-ids (float specialization) + 
template + __host__ __device__ __forceinline__ void BinSelect(float sample, int &bin, bool valid) + { + LevelT level_sample = (LevelT) sample; + + if (valid && (level_sample >= min) && (level_sample < max)) + bin = (int) ((level_sample - min) * scale); + } + + // Method for converting samples to bin-ids (double specialization) + template + __host__ __device__ __forceinline__ void BinSelect(double sample, int &bin, bool valid) + { + LevelT level_sample = (LevelT) sample; + + if (valid && (level_sample >= min) && (level_sample < max)) + bin = (int) ((level_sample - min) * scale); + } + }; + + + // Pass-through bin transform operator + struct PassThruTransform + { + // Method for converting samples to bin-ids + template + __host__ __device__ __forceinline__ void BinSelect(_SampleT sample, int &bin, bool valid) + { + if (valid) + bin = (int) sample; + } + }; + + + + //--------------------------------------------------------------------- + // Tuning policies + //--------------------------------------------------------------------- + + template + struct TScale + { + enum + { + V_SCALE = (sizeof(SampleT) + sizeof(int) - 1) / sizeof(int), + VALUE = CUB_MAX((NOMINAL_ITEMS_PER_THREAD / NUM_ACTIVE_CHANNELS / V_SCALE), 1) + }; + }; + + + /// SM11 + struct Policy110 + { + // HistogramSweepPolicy + typedef AgentHistogramPolicy< + 512, + (NUM_CHANNELS == 1) ? 8 : 2, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + true, + GMEM, + false> + HistogramSweepPolicy; + }; + + /// SM20 + struct Policy200 + { + // HistogramSweepPolicy + typedef AgentHistogramPolicy< + (NUM_CHANNELS == 1) ? 256 : 128, + (NUM_CHANNELS == 1) ? 8 : 3, + (NUM_CHANNELS == 1) ? BLOCK_LOAD_DIRECT : BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + true, + SMEM, + false> + HistogramSweepPolicy; + }; + + /// SM30 + struct Policy300 + { + // HistogramSweepPolicy + typedef AgentHistogramPolicy< + 512, + (NUM_CHANNELS == 1) ? 
8 : 2, + BLOCK_LOAD_DIRECT, + LOAD_DEFAULT, + true, + GMEM, + false> + HistogramSweepPolicy; + }; + + /// SM35 + struct Policy350 + { + // HistogramSweepPolicy + typedef AgentHistogramPolicy< + 128, + TScale<8>::VALUE, + BLOCK_LOAD_DIRECT, + LOAD_LDG, + true, + BLEND, + true> + HistogramSweepPolicy; + }; + + /// SM50 + struct Policy500 + { + // HistogramSweepPolicy + typedef AgentHistogramPolicy< + 384, + TScale<16>::VALUE, + BLOCK_LOAD_DIRECT, + LOAD_LDG, + true, + SMEM, + false> + HistogramSweepPolicy; + }; + + + + //--------------------------------------------------------------------- + // Tuning policies of current PTX compiler pass + //--------------------------------------------------------------------- + +#if (CUB_PTX_ARCH >= 500) + typedef Policy500 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 350) + typedef Policy350 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 300) + typedef Policy300 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 200) + typedef Policy200 PtxPolicy; + +#else + typedef Policy110 PtxPolicy; + +#endif + + // "Opaque" policies (whose parameterizations aren't reflected in the type signature) + struct PtxHistogramSweepPolicy : PtxPolicy::HistogramSweepPolicy {}; + + + //--------------------------------------------------------------------- + // Utilities + //--------------------------------------------------------------------- + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t InitConfigs( + int ptx_version, + KernelConfig &histogram_sweep_config) + { + #if (CUB_PTX_ARCH > 0) + + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + return histogram_sweep_config.template Init(); + + #else + + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 500) + { + return histogram_sweep_config.template Init(); + } + else if (ptx_version >= 350) + { + return histogram_sweep_config.template Init(); + } + else if (ptx_version >= 300) + { + return histogram_sweep_config.template Init(); + } + else if (ptx_version >= 200) + { + return histogram_sweep_config.template Init(); + } + else if (ptx_version >= 110) + { + return histogram_sweep_config.template Init(); + } + else + { + // No global atomic support + return cudaErrorNotSupported; + } + + #endif + } + + + /** + * Kernel kernel dispatch configuration + */ + struct KernelConfig + { + int block_threads; + int pixels_per_thread; + + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t Init() + { + block_threads = BlockPolicy::BLOCK_THREADS; + pixels_per_thread = BlockPolicy::PIXELS_PER_THREAD; + + return cudaSuccess; + } + }; + + + //--------------------------------------------------------------------- + // Dispatch entrypoints + //--------------------------------------------------------------------- + + /** + * Privatization-based dispatch routine + */ + template < + typename PrivatizedDecodeOpT, ///< The transform operator type for determining privatized counter indices from samples, one for each channel + typename OutputDecodeOpT, ///< The transform operator type for determining output bin-ids from privatized counter indices, one for each channel + typename DeviceHistogramInitKernelT, ///< Function type of cub::DeviceHistogramInitKernel + typename DeviceHistogramSweepKernelT> ///< Function type of cub::DeviceHistogramSweepKernel + CUB_RUNTIME_FUNCTION 
__forceinline__ + static cudaError_t PrivatizedDispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. + int num_privatized_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. + PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS], ///< [in] Transform operators for determining bin-ids from samples, one for each channel + int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. + OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS], ///< [in] Transform operators for determining bin-ids from samples, one for each channel + int max_num_output_bins, ///< [in] Maximum number of output bins in any channel + OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest + DeviceHistogramInitKernelT histogram_init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceHistogramInitKernel + DeviceHistogramSweepKernelT histogram_sweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceHistogramSweepKernel + KernelConfig histogram_sweep_config, ///< [in] Dispatch parameters that match the policy that \p histogram_sweep_kernel was compiled for + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
+ { + #ifndef CUB_RUNTIME_ENABLED + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported); + + #else + + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Get SM occupancy for histogram_sweep_kernel + int histogram_sweep_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy( + histogram_sweep_sm_occupancy, + histogram_sweep_kernel, + histogram_sweep_config.block_threads))) break; + + // Get device occupancy for histogram_sweep_kernel + int histogram_sweep_occupancy = histogram_sweep_sm_occupancy * sm_count; + + if (num_row_pixels * NUM_CHANNELS == row_stride_samples) + { + // Treat as a single linear array of samples + num_row_pixels *= num_rows; + num_rows = 1; + row_stride_samples = num_row_pixels * NUM_CHANNELS; + } + + // Get grid dimensions, trying to keep total blocks ~histogram_sweep_occupancy + int pixels_per_tile = histogram_sweep_config.block_threads * histogram_sweep_config.pixels_per_thread; + int tiles_per_row = int(num_row_pixels + pixels_per_tile - 1) / pixels_per_tile; + int blocks_per_row = CUB_MIN(histogram_sweep_occupancy, tiles_per_row); + int blocks_per_col = (blocks_per_row > 0) ? + int(CUB_MIN(histogram_sweep_occupancy / blocks_per_row, num_rows)) : + 0; + int num_thread_blocks = blocks_per_row * blocks_per_col; + + dim3 sweep_grid_dims; + sweep_grid_dims.x = (unsigned int) blocks_per_row; + sweep_grid_dims.y = (unsigned int) blocks_per_col; + sweep_grid_dims.z = 1; + + // Temporary storage allocation requirements + const int NUM_ALLOCATIONS = NUM_ACTIVE_CHANNELS + 1; + void* allocations[NUM_ALLOCATIONS]; + size_t allocation_sizes[NUM_ALLOCATIONS]; + + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + allocation_sizes[CHANNEL] = size_t(num_thread_blocks) * (num_privatized_levels[CHANNEL] - 1) * sizeof(CounterT); + + allocation_sizes[NUM_ALLOCATIONS - 1] = GridQueue::AllocationSize(); + + // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + break; + } + + // Construct the grid queue descriptor + GridQueue tile_queue(allocations[NUM_ALLOCATIONS - 1]); + + // Setup array wrapper for histogram channel output (because we can't pass static arrays as kernel parameters) + ArrayWrapper d_output_histograms_wrapper; + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + d_output_histograms_wrapper.array[CHANNEL] = d_output_histograms[CHANNEL]; + + // Setup array wrapper for privatized per-block histogram channel output (because we can't pass static arrays as kernel parameters) + ArrayWrapper d_privatized_histograms_wrapper; + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + d_privatized_histograms_wrapper.array[CHANNEL] = (CounterT*) allocations[CHANNEL]; + + // Setup array wrapper for sweep bin transforms (because we can't pass static arrays as kernel parameters) + ArrayWrapper privatized_decode_op_wrapper; + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + privatized_decode_op_wrapper.array[CHANNEL] = privatized_decode_op[CHANNEL]; + + // Setup 
array wrapper for aggregation bin transforms (because we can't pass static arrays as kernel parameters) + ArrayWrapper output_decode_op_wrapper; + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + output_decode_op_wrapper.array[CHANNEL] = output_decode_op[CHANNEL]; + + // Setup array wrapper for num privatized bins (because we can't pass static arrays as kernel parameters) + ArrayWrapper num_privatized_bins_wrapper; + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + num_privatized_bins_wrapper.array[CHANNEL] = num_privatized_levels[CHANNEL] - 1; + + // Setup array wrapper for num output bins (because we can't pass static arrays as kernel parameters) + ArrayWrapper num_output_bins_wrapper; + for (int CHANNEL = 0; CHANNEL < NUM_ACTIVE_CHANNELS; ++CHANNEL) + num_output_bins_wrapper.array[CHANNEL] = num_output_levels[CHANNEL] - 1; + + int histogram_init_block_threads = 256; + int histogram_init_grid_dims = (max_num_output_bins + histogram_init_block_threads - 1) / histogram_init_block_threads; + + // Log DeviceHistogramInitKernel configuration + if (debug_synchronous) _CubLog("Invoking DeviceHistogramInitKernel<<<%d, %d, 0, %lld>>>()\n", + histogram_init_grid_dims, histogram_init_block_threads, (long long) stream); + + // Invoke histogram_init_kernel + histogram_init_kernel<<>>( + num_output_bins_wrapper, + d_output_histograms_wrapper, + tile_queue); + + // Return if empty problem + if ((blocks_per_row == 0) || (blocks_per_col == 0)) + break; + + // Log histogram_sweep_kernel configuration + if (debug_synchronous) _CubLog("Invoking histogram_sweep_kernel<<<{%d, %d, %d}, %d, 0, %lld>>>(), %d pixels per thread, %d SM occupancy\n", + sweep_grid_dims.x, sweep_grid_dims.y, sweep_grid_dims.z, + histogram_sweep_config.block_threads, (long long) stream, histogram_sweep_config.pixels_per_thread, histogram_sweep_sm_occupancy); + + // Invoke histogram_sweep_kernel + histogram_sweep_kernel<<>>( + d_samples, + num_output_bins_wrapper, + num_privatized_bins_wrapper, + d_output_histograms_wrapper, + d_privatized_histograms_wrapper, + output_decode_op_wrapper, + privatized_decode_op_wrapper, + num_row_pixels, + num_rows, + row_stride_samples, + tiles_per_row, + tile_queue); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + } + while (0); + + return error; + + #endif // CUB_RUNTIME_ENABLED + } + + + + /** + * Dispatch routine for HistogramRange, specialized for sample types larger than 8bit + */ + CUB_RUNTIME_FUNCTION + static cudaError_t DispatchRange( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. 
+ int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. + LevelT *d_levels[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. + OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + Int2Type is_byte_sample) ///< [in] Marker type indicating whether or not SampleT is a 8b type + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel dispatch configurations + KernelConfig histogram_sweep_config; + if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config))) + break; + + // Use the search transform op for converting samples to privatized bins + typedef SearchTransform PrivatizedDecodeOpT; + + // Use the pass-thru transform op for converting privatized bins to output bins + typedef PassThruTransform OutputDecodeOpT; + + PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS]; + OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS]; + int max_levels = num_output_levels[0]; + + for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) + { + privatized_decode_op[channel].Init(d_levels[channel], num_output_levels[channel]); + if (num_output_levels[channel] > max_levels) + max_levels = num_output_levels[channel]; + } + int max_num_output_bins = max_levels - 1; + + // Dispatch + if (max_num_output_bins > MAX_PRIVATIZED_SMEM_BINS) + { + // Too many bins to keep in shared memory. 
+ const int PRIVATIZED_SMEM_BINS = 0; + + if (CubDebug(error = PrivatizedDispatch( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_output_histograms, + num_output_levels, + privatized_decode_op, + num_output_levels, + output_decode_op, + max_num_output_bins, + num_row_pixels, + num_rows, + row_stride_samples, + DeviceHistogramInitKernel, + DeviceHistogramSweepKernel, + histogram_sweep_config, + stream, + debug_synchronous))) break; + } + else + { + // Dispatch shared-privatized approach + const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS; + + if (CubDebug(error = PrivatizedDispatch( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_output_histograms, + num_output_levels, + privatized_decode_op, + num_output_levels, + output_decode_op, + max_num_output_bins, + num_row_pixels, + num_rows, + row_stride_samples, + DeviceHistogramInitKernel, + DeviceHistogramSweepKernel, + histogram_sweep_config, + stream, + debug_synchronous))) break; + } + + } while (0); + + return error; + } + + + /** + * Dispatch routine for HistogramRange, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels) + */ + CUB_RUNTIME_FUNCTION + static cudaError_t DispatchRange( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the multi-channel input sequence of data samples. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. + int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of boundaries (levels) for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. + LevelT *d_levels[NUM_ACTIVE_CHANNELS], ///< [in] The pointers to the arrays of boundaries (levels), one for each active channel. Bin ranges are defined by consecutive boundary pairings: lower sample value boundaries are inclusive and upper sample value boundaries are exclusive. + OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. 
+ Int2Type is_byte_sample) ///< [in] Marker type indicating whether or not SampleT is a 8b type + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel dispatch configurations + KernelConfig histogram_sweep_config; + if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config))) + break; + + // Use the pass-thru transform op for converting samples to privatized bins + typedef PassThruTransform PrivatizedDecodeOpT; + + // Use the search transform op for converting privatized bins to output bins + typedef SearchTransform OutputDecodeOpT; + + int num_privatized_levels[NUM_ACTIVE_CHANNELS]; + PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS]; + OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS]; + int max_levels = num_output_levels[0]; // Maximum number of levels in any channel + + for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) + { + num_privatized_levels[channel] = 257; + output_decode_op[channel].Init(d_levels[channel], num_output_levels[channel]); + + if (num_output_levels[channel] > max_levels) + max_levels = num_output_levels[channel]; + } + int max_num_output_bins = max_levels - 1; + + const int PRIVATIZED_SMEM_BINS = 256; + + if (CubDebug(error = PrivatizedDispatch( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_output_histograms, + num_privatized_levels, + privatized_decode_op, + num_output_levels, + output_decode_op, + max_num_output_bins, + num_row_pixels, + num_rows, + row_stride_samples, + DeviceHistogramInitKernel, + DeviceHistogramSweepKernel, + histogram_sweep_config, + stream, + debug_synchronous))) break; + + } while (0); + + return error; + } + + + /** + * Dispatch routine for HistogramEven, specialized for sample types larger than 8-bit + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t DispatchEven( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. + int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. + LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. + LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. 
+ OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + Int2Type is_byte_sample) ///< [in] Marker type indicating whether or not SampleT is a 8b type + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel dispatch configurations + KernelConfig histogram_sweep_config; + if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config))) + break; + + // Use the scale transform op for converting samples to privatized bins + typedef ScaleTransform PrivatizedDecodeOpT; + + // Use the pass-thru transform op for converting privatized bins to output bins + typedef PassThruTransform OutputDecodeOpT; + + PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS]; + OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS]; + int max_levels = num_output_levels[0]; + + for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) + { + int bins = num_output_levels[channel] - 1; + LevelT scale = (upper_level[channel] - lower_level[channel]) / bins; + + privatized_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale); + + if (num_output_levels[channel] > max_levels) + max_levels = num_output_levels[channel]; + } + int max_num_output_bins = max_levels - 1; + + if (max_num_output_bins > MAX_PRIVATIZED_SMEM_BINS) + { + // Dispatch shared-privatized approach + const int PRIVATIZED_SMEM_BINS = 0; + + if (CubDebug(error = PrivatizedDispatch( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_output_histograms, + num_output_levels, + privatized_decode_op, + num_output_levels, + output_decode_op, + max_num_output_bins, + num_row_pixels, + num_rows, + row_stride_samples, + DeviceHistogramInitKernel, + DeviceHistogramSweepKernel, + histogram_sweep_config, + stream, + debug_synchronous))) break; + } + else + { + // Dispatch shared-privatized approach + const int PRIVATIZED_SMEM_BINS = MAX_PRIVATIZED_SMEM_BINS; + + if (CubDebug(error = PrivatizedDispatch( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_output_histograms, + num_output_levels, + privatized_decode_op, + num_output_levels, + output_decode_op, + max_num_output_bins, + num_row_pixels, + num_rows, + row_stride_samples, + DeviceHistogramInitKernel, + DeviceHistogramSweepKernel, + histogram_sweep_config, + stream, + debug_synchronous))) break; + } + } + while (0); + + return error; + } + + + /** + * Dispatch routine for HistogramEven, specialized for 8-bit sample types (computes 256-bin privatized histograms and then reduces to user-specified levels) + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t DispatchEven( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SampleIteratorT d_samples, ///< [in] The pointer to the input sequence of sample items. The samples from different channels are assumed to be interleaved (e.g., an array of 32-bit pixels where each pixel consists of four RGBA 8-bit samples). + CounterT* d_output_histograms[NUM_ACTIVE_CHANNELS], ///< [out] The pointers to the histogram counter output arrays, one for each active channel. For channeli, the allocation length of d_histograms[i] should be num_output_levels[i] - 1. + int num_output_levels[NUM_ACTIVE_CHANNELS], ///< [in] The number of bin level boundaries for delineating histogram samples in each active channel. Implies that the number of bins for channeli is num_output_levels[i] - 1. + LevelT lower_level[NUM_ACTIVE_CHANNELS], ///< [in] The lower sample value bound (inclusive) for the lowest histogram bin in each active channel. + LevelT upper_level[NUM_ACTIVE_CHANNELS], ///< [in] The upper sample value bound (exclusive) for the highest histogram bin in each active channel. + OffsetT num_row_pixels, ///< [in] The number of multi-channel pixels per row in the region of interest + OffsetT num_rows, ///< [in] The number of rows in the region of interest + OffsetT row_stride_samples, ///< [in] The number of samples between starts of consecutive rows in the region of interest + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + Int2Type is_byte_sample) ///< [in] Marker type indicating whether or not SampleT is a 8b type + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel dispatch configurations + KernelConfig histogram_sweep_config; + if (CubDebug(error = InitConfigs(ptx_version, histogram_sweep_config))) + break; + + // Use the pass-thru transform op for converting samples to privatized bins + typedef PassThruTransform PrivatizedDecodeOpT; + + // Use the scale transform op for converting privatized bins to output bins + typedef ScaleTransform OutputDecodeOpT; + + int num_privatized_levels[NUM_ACTIVE_CHANNELS]; + PrivatizedDecodeOpT privatized_decode_op[NUM_ACTIVE_CHANNELS]; + OutputDecodeOpT output_decode_op[NUM_ACTIVE_CHANNELS]; + int max_levels = num_output_levels[0]; + + for (int channel = 0; channel < NUM_ACTIVE_CHANNELS; ++channel) + { + num_privatized_levels[channel] = 257; + + int bins = num_output_levels[channel] - 1; + LevelT scale = (upper_level[channel] - lower_level[channel]) / bins; + output_decode_op[channel].Init(num_output_levels[channel], upper_level[channel], lower_level[channel], scale); + + if (num_output_levels[channel] > max_levels) + max_levels = num_output_levels[channel]; + } + int max_num_output_bins = max_levels - 1; + + const int PRIVATIZED_SMEM_BINS = 256; + + if (CubDebug(error = PrivatizedDispatch( + d_temp_storage, + temp_storage_bytes, + d_samples, + d_output_histograms, + num_privatized_levels, + privatized_decode_op, + num_output_levels, + output_decode_op, + max_num_output_bins, + num_row_pixels, + num_rows, + row_stride_samples, + DeviceHistogramInitKernel, + DeviceHistogramSweepKernel, + histogram_sweep_config, + stream, + debug_synchronous))) break; + + 
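    /*
     * A minimal usage sketch (illustrative, not part of this patch): these
     * DispatchEven routines are normally reached through the public
     * cub::DeviceHistogram::HistogramEven entry point, using the two-phase
     * convention implemented here (a first call with d_temp_storage == NULL
     * only reports temp_storage_bytes; the second call launches the init and
     * sweep kernels). The buffers below are assumed to be device allocations
     * populated elsewhere.
     *
     * \code
     * unsigned char *d_samples;    // num_samples 8-bit samples (assumed allocated)
     * int           *d_histogram;  // 256 counters              (assumed allocated)
     * int num_samples = 1 << 20;
     * int num_levels  = 257;       // bins = num_levels - 1 = 256
     * int lower_level = 0, upper_level = 256;
     *
     * void  *d_temp_storage     = NULL;
     * size_t temp_storage_bytes = 0;
     * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
     *     d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples);
     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
     * cub::DeviceHistogram::HistogramEven(d_temp_storage, temp_storage_bytes,
     *     d_samples, d_histogram, num_levels, lower_level, upper_level, num_samples);
     * \endcode
     *
     * Because the samples are 8-bit, this takes the 256-bin privatized path
     * (PRIVATIZED_SMEM_BINS = 256) and then reduces to the requested output bins.
     */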
} + while (0); + + return error; + } + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/GraphBLAS/CUDA/local_cub/device/dispatch/dispatch_radix_sort.cuh b/GraphBLAS/CUDA/local_cub/device/dispatch/dispatch_radix_sort.cuh new file mode 100644 index 0000000000..d1a992d438 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/device/dispatch/dispatch_radix_sort.cuh @@ -0,0 +1,1619 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceRadixSort provides device-wide, parallel operations for computing a radix sort across a sequence of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "../../agent/agent_radix_sort_upsweep.cuh" +#include "../../agent/agent_radix_sort_downsweep.cuh" +#include "../../agent/agent_scan.cuh" +#include "../../block/block_radix_sort.cuh" +#include "../../grid/grid_even_share.cuh" +#include "../../util_type.cuh" +#include "../../util_debug.cuh" +#include "../../util_device.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +/** + * Upsweep digit-counting kernel entry point (multi-block). Computes privatized digit histograms, one per block. 
+ */ +template < + typename ChainedPolicyT, ///< Chained tuning policy + bool ALT_DIGIT_BITS, ///< Whether or not to use the alternate (lower-bits) policy + bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low + typename KeyT, ///< Key type + typename OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int((ALT_DIGIT_BITS) ? + ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS : + ChainedPolicyT::ActivePolicy::UpsweepPolicy::BLOCK_THREADS)) +__global__ void DeviceRadixSortUpsweepKernel( + const KeyT *d_keys, ///< [in] Input keys buffer + OffsetT *d_spine, ///< [out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.) + OffsetT /*num_items*/, ///< [in] Total number of input data items + int current_bit, ///< [in] Bit position of current radix digit + int num_bits, ///< [in] Number of bits of current radix digit + GridEvenShare even_share) ///< [in] Even-share descriptor for mapan equal number of tiles onto each thread block +{ + enum { + TILE_ITEMS = ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS * + ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::ITEMS_PER_THREAD + }; + + // Parameterize AgentRadixSortUpsweep type for the current configuration + typedef AgentRadixSortUpsweep< + typename If<(ALT_DIGIT_BITS), + typename ChainedPolicyT::ActivePolicy::AltUpsweepPolicy, + typename ChainedPolicyT::ActivePolicy::UpsweepPolicy>::Type, + KeyT, + OffsetT> + AgentRadixSortUpsweepT; + + // Shared memory storage + __shared__ typename AgentRadixSortUpsweepT::TempStorage temp_storage; + + // Initialize GRID_MAPPING_RAKE even-share descriptor for this thread block + even_share.template BlockInit(); + + AgentRadixSortUpsweepT upsweep(temp_storage, d_keys, current_bit, num_bits); + + upsweep.ProcessRegion(even_share.block_offset, even_share.block_end); + + CTA_SYNC(); + + // Write out digit counts (striped) + upsweep.template ExtractCounts(d_spine, gridDim.x, blockIdx.x); +} + + +/** + * Spine scan kernel entry point (single-block). Computes an exclusive prefix sum over the privatized digit histograms + */ +template < + typename ChainedPolicyT, ///< Chained tuning policy + typename OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ScanPolicy::BLOCK_THREADS), 1) +__global__ void RadixSortScanBinsKernel( + OffsetT *d_spine, ///< [in,out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.) + int num_counts) ///< [in] Total number of bin-counts +{ + // Parameterize the AgentScan type for the current configuration + typedef AgentScan< + typename ChainedPolicyT::ActivePolicy::ScanPolicy, + OffsetT*, + OffsetT*, + cub::Sum, + OffsetT, + OffsetT> + AgentScanT; + + // Shared memory storage + __shared__ typename AgentScanT::TempStorage temp_storage; + + // Block scan instance + AgentScanT block_scan(temp_storage, d_spine, d_spine, cub::Sum(), OffsetT(0)) ; + + // Process full input tiles + int block_offset = 0; + BlockScanRunningPrefixOp prefix_op(0, Sum()); + while (block_offset + AgentScanT::TILE_ITEMS <= num_counts) + { + block_scan.template ConsumeTile(block_offset, prefix_op); + block_offset += AgentScanT::TILE_ITEMS; + } +} + + +/** + * Downsweep pass kernel entry point (multi-block). Scatters keys (and values) into corresponding bins for the current digit place. 
+ */ +template < + typename ChainedPolicyT, ///< Chained tuning policy + bool ALT_DIGIT_BITS, ///< Whether or not to use the alternate (lower-bits) policy + bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low + typename KeyT, ///< Key type + typename ValueT, ///< Value type + typename OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int((ALT_DIGIT_BITS) ? + ChainedPolicyT::ActivePolicy::AltDownsweepPolicy::BLOCK_THREADS : + ChainedPolicyT::ActivePolicy::DownsweepPolicy::BLOCK_THREADS)) +__global__ void DeviceRadixSortDownsweepKernel( + const KeyT *d_keys_in, ///< [in] Input keys buffer + KeyT *d_keys_out, ///< [in] Output keys buffer + const ValueT *d_values_in, ///< [in] Input values buffer + ValueT *d_values_out, ///< [in] Output values buffer + OffsetT *d_spine, ///< [in] Scan of privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.) + OffsetT num_items, ///< [in] Total number of input data items + int current_bit, ///< [in] Bit position of current radix digit + int num_bits, ///< [in] Number of bits of current radix digit + GridEvenShare even_share) ///< [in] Even-share descriptor for mapan equal number of tiles onto each thread block +{ + enum { + TILE_ITEMS = ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS * + ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::ITEMS_PER_THREAD + }; + + // Parameterize AgentRadixSortDownsweep type for the current configuration + typedef AgentRadixSortDownsweep< + typename If<(ALT_DIGIT_BITS), + typename ChainedPolicyT::ActivePolicy::AltDownsweepPolicy, + typename ChainedPolicyT::ActivePolicy::DownsweepPolicy>::Type, + IS_DESCENDING, + KeyT, + ValueT, + OffsetT> + AgentRadixSortDownsweepT; + + // Shared memory storage + __shared__ typename AgentRadixSortDownsweepT::TempStorage temp_storage; + + // Initialize even-share descriptor for this thread block + even_share.template BlockInit(); + + // Process input tiles + AgentRadixSortDownsweepT(temp_storage, num_items, d_spine, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit, num_bits).ProcessRegion( + even_share.block_offset, + even_share.block_end); +} + + +/** + * Single pass kernel entry point (single-block). Fully sorts a tile of input. 
+ */ +template < + typename ChainedPolicyT, ///< Chained tuning policy + bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low + typename KeyT, ///< Key type + typename ValueT, ///< Value type + typename OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1) +__global__ void DeviceRadixSortSingleTileKernel( + const KeyT *d_keys_in, ///< [in] Input keys buffer + KeyT *d_keys_out, ///< [in] Output keys buffer + const ValueT *d_values_in, ///< [in] Input values buffer + ValueT *d_values_out, ///< [in] Output values buffer + OffsetT num_items, ///< [in] Total number of input data items + int current_bit, ///< [in] Bit position of current radix digit + int end_bit) ///< [in] The past-the-end (most-significant) bit index needed for key comparison +{ + // Constants + enum + { + BLOCK_THREADS = ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS, + ITEMS_PER_THREAD = ChainedPolicyT::ActivePolicy::SingleTilePolicy::ITEMS_PER_THREAD, + KEYS_ONLY = Equals::VALUE, + }; + + // BlockRadixSort type + typedef BlockRadixSort< + KeyT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + ValueT, + ChainedPolicyT::ActivePolicy::SingleTilePolicy::RADIX_BITS, + (ChainedPolicyT::ActivePolicy::SingleTilePolicy::RANK_ALGORITHM == RADIX_RANK_MEMOIZE), + ChainedPolicyT::ActivePolicy::SingleTilePolicy::SCAN_ALGORITHM> + BlockRadixSortT; + + // BlockLoad type (keys) + typedef BlockLoad< + KeyT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + ChainedPolicyT::ActivePolicy::SingleTilePolicy::LOAD_ALGORITHM> BlockLoadKeys; + + // BlockLoad type (values) + typedef BlockLoad< + ValueT, + BLOCK_THREADS, + ITEMS_PER_THREAD, + ChainedPolicyT::ActivePolicy::SingleTilePolicy::LOAD_ALGORITHM> BlockLoadValues; + + // Unsigned word for key bits + typedef typename Traits::UnsignedBits UnsignedBitsT; + + // Shared memory storage + __shared__ union TempStorage + { + typename BlockRadixSortT::TempStorage sort; + typename BlockLoadKeys::TempStorage load_keys; + typename BlockLoadValues::TempStorage load_values; + + } temp_storage; + + // Keys and values for the block + KeyT keys[ITEMS_PER_THREAD]; + ValueT values[ITEMS_PER_THREAD]; + + // Get default (min/max) value for out-of-bounds keys + UnsignedBitsT default_key_bits = (IS_DESCENDING) ? 
Traits::LOWEST_KEY : Traits::MAX_KEY; + KeyT default_key = reinterpret_cast(default_key_bits); + + // Load keys + BlockLoadKeys(temp_storage.load_keys).Load(d_keys_in, keys, num_items, default_key); + + CTA_SYNC(); + + // Load values + if (!KEYS_ONLY) + { + // Register pressure work-around: moving num_items through shfl prevents compiler + // from reusing guards/addressing from prior guarded loads + num_items = ShuffleIndex(num_items, 0, 0xffffffff); + + BlockLoadValues(temp_storage.load_values).Load(d_values_in, values, num_items); + + CTA_SYNC(); + } + + // Sort tile + BlockRadixSortT(temp_storage.sort).SortBlockedToStriped( + keys, + values, + current_bit, + end_bit, + Int2Type(), + Int2Type()); + + // Store keys and values + #pragma unroll + for (int ITEM = 0; ITEM < ITEMS_PER_THREAD; ++ITEM) + { + int item_offset = ITEM * BLOCK_THREADS + threadIdx.x; + if (item_offset < num_items) + { + d_keys_out[item_offset] = keys[ITEM]; + if (!KEYS_ONLY) + d_values_out[item_offset] = values[ITEM]; + } + } +} + + +/** + * Segmented radix sorting pass (one block per segment) + */ +template < + typename ChainedPolicyT, ///< Chained tuning policy + bool ALT_DIGIT_BITS, ///< Whether or not to use the alternate (lower-bits) policy + bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low + typename KeyT, ///< Key type + typename ValueT, ///< Value type + typename OffsetIteratorT, ///< Random-access input iterator type for reading segment offsets \iterator + typename OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int((ALT_DIGIT_BITS) ? + ChainedPolicyT::ActivePolicy::AltSegmentedPolicy::BLOCK_THREADS : + ChainedPolicyT::ActivePolicy::SegmentedPolicy::BLOCK_THREADS)) +__global__ void DeviceSegmentedRadixSortKernel( + const KeyT *d_keys_in, ///< [in] Input keys buffer + KeyT *d_keys_out, ///< [in] Output keys buffer + const ValueT *d_values_in, ///< [in] Input values buffer + ValueT *d_values_out, ///< [in] Output values buffer + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. 
+ int /*num_segments*/, ///< [in] The number of segments that comprise the sorting data + int current_bit, ///< [in] Bit position of current radix digit + int pass_bits) ///< [in] Number of bits of current radix digit +{ + // + // Constants + // + + typedef typename If<(ALT_DIGIT_BITS), + typename ChainedPolicyT::ActivePolicy::AltSegmentedPolicy, + typename ChainedPolicyT::ActivePolicy::SegmentedPolicy>::Type SegmentedPolicyT; + + enum + { + BLOCK_THREADS = SegmentedPolicyT::BLOCK_THREADS, + ITEMS_PER_THREAD = SegmentedPolicyT::ITEMS_PER_THREAD, + RADIX_BITS = SegmentedPolicyT::RADIX_BITS, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + RADIX_DIGITS = 1 << RADIX_BITS, + KEYS_ONLY = Equals::VALUE, + }; + + // Upsweep type + typedef AgentRadixSortUpsweep< + AgentRadixSortUpsweepPolicy, + KeyT, + OffsetT> + BlockUpsweepT; + + // Digit-scan type + typedef BlockScan DigitScanT; + + // Downsweep type + typedef AgentRadixSortDownsweep BlockDownsweepT; + + enum + { + /// Number of bin-starting offsets tracked per thread + BINS_TRACKED_PER_THREAD = BlockDownsweepT::BINS_TRACKED_PER_THREAD + }; + + // + // Process input tiles + // + + // Shared memory storage + __shared__ union + { + typename BlockUpsweepT::TempStorage upsweep; + typename BlockDownsweepT::TempStorage downsweep; + struct + { + volatile OffsetT reverse_counts_in[RADIX_DIGITS]; + volatile OffsetT reverse_counts_out[RADIX_DIGITS]; + typename DigitScanT::TempStorage scan; + }; + + } temp_storage; + + OffsetT segment_begin = d_begin_offsets[blockIdx.x]; + OffsetT segment_end = d_end_offsets[blockIdx.x]; + OffsetT num_items = segment_end - segment_begin; + + // Check if empty segment + if (num_items <= 0) + return; + + // Upsweep + BlockUpsweepT upsweep(temp_storage.upsweep, d_keys_in, current_bit, pass_bits); + upsweep.ProcessRegion(segment_begin, segment_end); + + CTA_SYNC(); + + // The count of each digit value in this pass (valid in the first RADIX_DIGITS threads) + OffsetT bin_count[BINS_TRACKED_PER_THREAD]; + upsweep.ExtractCounts(bin_count); + + CTA_SYNC(); + + if (IS_DESCENDING) + { + // Reverse bin counts + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + temp_storage.reverse_counts_in[bin_idx] = bin_count[track]; + } + + CTA_SYNC(); + + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + bin_count[track] = temp_storage.reverse_counts_in[RADIX_DIGITS - bin_idx - 1]; + } + } + + // Scan + OffsetT bin_offset[BINS_TRACKED_PER_THREAD]; // The global scatter base offset for each digit value in this pass (valid in the first RADIX_DIGITS threads) + DigitScanT(temp_storage.scan).ExclusiveSum(bin_count, bin_offset); + + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + bin_offset[track] += segment_begin; + } + + if (IS_DESCENDING) + { + // Reverse bin offsets + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = (threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + temp_storage.reverse_counts_out[threadIdx.x] = bin_offset[track]; + } + + CTA_SYNC(); + + #pragma unroll + for (int track = 0; track < BINS_TRACKED_PER_THREAD; ++track) + { + int bin_idx = 
(threadIdx.x * BINS_TRACKED_PER_THREAD) + track; + + if ((BLOCK_THREADS == RADIX_DIGITS) || (bin_idx < RADIX_DIGITS)) + bin_offset[track] = temp_storage.reverse_counts_out[RADIX_DIGITS - bin_idx - 1]; + } + } + + CTA_SYNC(); + + // Downsweep + BlockDownsweepT downsweep(temp_storage.downsweep, bin_offset, num_items, d_keys_in, d_keys_out, d_values_in, d_values_out, current_bit, pass_bits); + downsweep.ProcessRegion(segment_begin, segment_end); +} + + + +/****************************************************************************** + * Policy + ******************************************************************************/ + +/** + * Tuning policy for kernel specialization + */ +template < + typename KeyT, ///< Key type + typename ValueT, ///< Value type + typename OffsetT> ///< Signed integer type for global offsets +struct DeviceRadixSortPolicy +{ + //------------------------------------------------------------------------------ + // Constants + //------------------------------------------------------------------------------ + + enum + { + // Whether this is a keys-only (or key-value) sort + KEYS_ONLY = (Equals::VALUE), + }; + + // Dominant-sized key/value type + typedef typename If<(sizeof(ValueT) > 4) && (sizeof(KeyT) < sizeof(ValueT)), ValueT, KeyT>::Type DominantT; + + //------------------------------------------------------------------------------ + // Architecture-specific tuning policies + //------------------------------------------------------------------------------ + + /// SM20 + struct Policy200 : ChainedPolicy<200, Policy200, Policy200> + { + enum { + PRIMARY_RADIX_BITS = 5, + ALT_RADIX_BITS = PRIMARY_RADIX_BITS - 1, + + // Relative size of KeyT type to a 4-byte word + SCALE_FACTOR_4B = (CUB_MAX(sizeof(KeyT), sizeof(ValueT)) + 3) / 4, + }; + + // Keys-only upsweep policies + typedef AgentRadixSortUpsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyKeys; + typedef AgentRadixSortUpsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyKeys; + + // Key-value pairs upsweep policies + typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyPairs; + typedef AgentRadixSortUpsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyPairs; + + // Upsweep policies + typedef typename If::Type UpsweepPolicy; + typedef typename If::Type AltUpsweepPolicy; + + // Scan policy + typedef AgentScanPolicy <512, 4, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; + + // Keys-only downsweep policies + typedef AgentRadixSortDownsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyKeys; + typedef AgentRadixSortDownsweepPolicy <64, CUB_MAX(1, 18 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyKeys; + + // Key-value pairs downsweep policies + typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyPairs; + typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 13 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyPairs; + + // Downsweep 
policies + typedef typename If::Type DownsweepPolicy; + typedef typename If::Type AltDownsweepPolicy; + + // Single-tile policy + typedef DownsweepPolicy SingleTilePolicy; + + // Segmented policies + typedef DownsweepPolicy SegmentedPolicy; + typedef AltDownsweepPolicy AltSegmentedPolicy; + }; + + /// SM30 + struct Policy300 : ChainedPolicy<300, Policy300, Policy200> + { + enum { + PRIMARY_RADIX_BITS = 5, + ALT_RADIX_BITS = PRIMARY_RADIX_BITS - 1, + + // Relative size of KeyT type to a 4-byte word + SCALE_FACTOR_4B = (CUB_MAX(sizeof(KeyT), sizeof(ValueT)) + 3) / 4, + }; + + // Keys-only upsweep policies + typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 7 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyKeys; + typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 7 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyKeys; + + // Key-value pairs upsweep policies + typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 5 / SCALE_FACTOR_4B), LOAD_DEFAULT, PRIMARY_RADIX_BITS> UpsweepPolicyPairs; + typedef AgentRadixSortUpsweepPolicy <256, CUB_MAX(1, 5 / SCALE_FACTOR_4B), LOAD_DEFAULT, ALT_RADIX_BITS> AltUpsweepPolicyPairs; + + // Upsweep policies + typedef typename If::Type UpsweepPolicy; + typedef typename If::Type AltUpsweepPolicy; + + // Scan policy + typedef AgentScanPolicy <1024, 4, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_WARP_SCANS> ScanPolicy; + + // Keys-only downsweep policies + typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 14 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyKeys; + typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 14 / SCALE_FACTOR_4B), BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyKeys; + + // Key-value pairs downsweep policies + typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 10 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, PRIMARY_RADIX_BITS> DownsweepPolicyPairs; + typedef AgentRadixSortDownsweepPolicy <128, CUB_MAX(1, 10 / SCALE_FACTOR_4B), BLOCK_LOAD_TRANSPOSE, LOAD_DEFAULT, RADIX_RANK_BASIC, BLOCK_SCAN_WARP_SCANS, ALT_RADIX_BITS> AltDownsweepPolicyPairs; + + // Downsweep policies + typedef typename If::Type DownsweepPolicy; + typedef typename If::Type AltDownsweepPolicy; + + // Single-tile policy + typedef DownsweepPolicy SingleTilePolicy; + + // Segmented policies + typedef DownsweepPolicy SegmentedPolicy; + typedef AltDownsweepPolicy AltSegmentedPolicy; + }; + + + /// SM35 + struct Policy350 : ChainedPolicy<350, Policy350, Policy300> + { + enum { + PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 
6 : 5, // 1.72B 32b keys/s, 1.17B 32b pairs/s, 1.55B 32b segmented keys/s (K40m) + }; + + // Scan policy + typedef AgentScanPolicy <1024, 4, BLOCK_LOAD_VECTORIZE, LOAD_DEFAULT, BLOCK_STORE_VECTORIZE, BLOCK_SCAN_WARP_SCANS> ScanPolicy; + + // Keys-only downsweep policies + typedef AgentRadixSortDownsweepPolicy DownsweepPolicyKeys; + typedef AgentRadixSortDownsweepPolicy AltDownsweepPolicyKeys; + + // Key-value pairs downsweep policies + typedef DownsweepPolicyKeys DownsweepPolicyPairs; + typedef AgentRadixSortDownsweepPolicy AltDownsweepPolicyPairs; + + // Downsweep policies + typedef typename If::Type DownsweepPolicy; + typedef typename If::Type AltDownsweepPolicy; + + // Upsweep policies + typedef DownsweepPolicy UpsweepPolicy; + typedef AltDownsweepPolicy AltUpsweepPolicy; + + // Single-tile policy + typedef DownsweepPolicy SingleTilePolicy; + + // Segmented policies + typedef DownsweepPolicy SegmentedPolicy; + typedef AltDownsweepPolicy AltSegmentedPolicy; + + + }; + + + /// SM50 + struct Policy500 : ChainedPolicy<500, Policy500, Policy350> + { + enum { + PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 3.5B 32b keys/s, 1.92B 32b pairs/s (TitanX) + SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, + SEGMENTED_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, // 3.1B 32b segmented keys/s (TitanX) + }; + + // ScanPolicy + typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; + + // Downsweep policies + typedef AgentRadixSortDownsweepPolicy DownsweepPolicy; + typedef AgentRadixSortDownsweepPolicy AltDownsweepPolicy; + + // Upsweep policies + typedef DownsweepPolicy UpsweepPolicy; + typedef AltDownsweepPolicy AltUpsweepPolicy; + + // Single-tile policy + typedef AgentRadixSortDownsweepPolicy SingleTilePolicy; + + // Segmented policies + typedef AgentRadixSortDownsweepPolicy SegmentedPolicy; + typedef AgentRadixSortDownsweepPolicy AltSegmentedPolicy; + }; + + + /// SM60 (GP100) + struct Policy600 : ChainedPolicy<600, Policy600, Policy500> + { + enum { + PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 6.9B 32b keys/s (Quadro P100) + SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, + SEGMENTED_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, // 5.9B 32b segmented keys/s (Quadro P100) + }; + + // ScanPolicy + typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; + + // Downsweep policies + typedef AgentRadixSortDownsweepPolicy DownsweepPolicy; + typedef AgentRadixSortDownsweepPolicy AltDownsweepPolicy; + + // Upsweep policies + typedef DownsweepPolicy UpsweepPolicy; + typedef AltDownsweepPolicy AltUpsweepPolicy; + + // Single-tile policy + typedef AgentRadixSortDownsweepPolicy SingleTilePolicy; + + // Segmented policies + typedef AgentRadixSortDownsweepPolicy SegmentedPolicy; + typedef AgentRadixSortDownsweepPolicy AltSegmentedPolicy; + + }; + + + /// SM61 (GP104) + struct Policy610 : ChainedPolicy<610, Policy610, Policy600> + { + enum { + PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 3.4B 32b keys/s, 1.83B 32b pairs/s (1080) + SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, + SEGMENTED_RADIX_BITS = (sizeof(KeyT) > 1) ? 
6 : 5, // 3.3B 32b segmented keys/s (1080) + }; + + // ScanPolicy + typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; + + // Downsweep policies + typedef AgentRadixSortDownsweepPolicy DownsweepPolicy; + typedef AgentRadixSortDownsweepPolicy AltDownsweepPolicy; + + // Upsweep policies + typedef AgentRadixSortUpsweepPolicy UpsweepPolicy; + typedef AgentRadixSortUpsweepPolicy AltUpsweepPolicy; + + // Single-tile policy + typedef AgentRadixSortDownsweepPolicy SingleTilePolicy; + + // Segmented policies + typedef AgentRadixSortDownsweepPolicy SegmentedPolicy; + typedef AgentRadixSortDownsweepPolicy AltSegmentedPolicy; + }; + + + /// SM62 (Tegra, less RF) + struct Policy620 : ChainedPolicy<620, Policy620, Policy610> + { + enum { + PRIMARY_RADIX_BITS = 5, + ALT_RADIX_BITS = PRIMARY_RADIX_BITS - 1, + }; + + // ScanPolicy + typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; + + // Downsweep policies + typedef AgentRadixSortDownsweepPolicy DownsweepPolicy; + typedef AgentRadixSortDownsweepPolicy AltDownsweepPolicy; + + // Upsweep policies + typedef DownsweepPolicy UpsweepPolicy; + typedef AltDownsweepPolicy AltUpsweepPolicy; + + // Single-tile policy + typedef AgentRadixSortDownsweepPolicy SingleTilePolicy; + + // Segmented policies + typedef DownsweepPolicy SegmentedPolicy; + typedef AltDownsweepPolicy AltSegmentedPolicy; + }; + + + /// SM70 (GV100) + struct Policy700 : ChainedPolicy<700, Policy700, Policy620> + { + enum { + PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 7.62B 32b keys/s (GV100) + SINGLE_TILE_RADIX_BITS = (sizeof(KeyT) > 1) ? 6 : 5, + SEGMENTED_RADIX_BITS = (sizeof(KeyT) > 1) ? 
6 : 5, // 8.7B 32b segmented keys/s (GV100) + }; + + // ScanPolicy + typedef AgentScanPolicy <512, 23, BLOCK_LOAD_WARP_TRANSPOSE, LOAD_DEFAULT, BLOCK_STORE_WARP_TRANSPOSE, BLOCK_SCAN_RAKING_MEMOIZE> ScanPolicy; + + // Downsweep policies + typedef AgentRadixSortDownsweepPolicy DownsweepPolicy; + typedef AgentRadixSortDownsweepPolicy AltDownsweepPolicy; + + // Upsweep policies + typedef DownsweepPolicy UpsweepPolicy; + typedef AltDownsweepPolicy AltUpsweepPolicy; + + // Single-tile policy + typedef AgentRadixSortDownsweepPolicy SingleTilePolicy; + + // Segmented policies + typedef AgentRadixSortDownsweepPolicy SegmentedPolicy; + typedef AgentRadixSortDownsweepPolicy AltSegmentedPolicy; + }; + + + /// MaxPolicy + typedef Policy700 MaxPolicy; + + +}; + + + +/****************************************************************************** + * Single-problem dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for device-wide radix sort + */ +template < + bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low + typename KeyT, ///< Key type + typename ValueT, ///< Value type + typename OffsetT> ///< Signed integer type for global offsets +struct DispatchRadixSort : + DeviceRadixSortPolicy +{ + //------------------------------------------------------------------------------ + // Constants + //------------------------------------------------------------------------------ + + enum + { + // Whether this is a keys-only (or key-value) sort + KEYS_ONLY = (Equals::VALUE), + }; + + + //------------------------------------------------------------------------------ + // Problem state + //------------------------------------------------------------------------------ + + void *d_temp_storage; ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys; ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + DoubleBuffer &d_values; ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values + OffsetT num_items; ///< [in] Number of items to sort + int begin_bit; ///< [in] The beginning (least-significant) bit index needed for key comparison + int end_bit; ///< [in] The past-the-end (most-significant) bit index needed for key comparison + cudaStream_t stream; ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous; ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
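    /*
     * The DoubleBuffer problem-state members above mirror the public API. A
     * minimal usage sketch (illustrative, not part of this patch); d_key_buf,
     * d_key_alt_buf, d_val_buf and d_val_alt_buf are assumed device allocations
     * of num_items elements, with the input in the "current" buffers:
     *
     * \code
     * cub::DoubleBuffer<unsigned int> d_keys(d_key_buf, d_key_alt_buf);
     * cub::DoubleBuffer<int>          d_values(d_val_buf, d_val_alt_buf);
     *
     * void  *d_temp_storage     = NULL;
     * size_t temp_storage_bytes = 0;
     * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
     *     d_keys, d_values, num_items);                 // size query only
     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
     * cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes,
     *     d_keys, d_values, num_items);                 // run the sort
     * // The sorted output is in d_keys.Current() / d_values.Current(); the
     * // selectors may have flipped, depending on the number of digit passes.
     * \endcode
     *
     * The DoubleBuffer overloads correspond to is_overwrite_okay == true, so the
     * passes can ping-pong between the two user buffers with no third buffer.
     */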
+ int ptx_version; ///< [in] PTX version + bool is_overwrite_okay; ///< [in] Whether is okay to overwrite source buffers + + + //------------------------------------------------------------------------------ + // Constructor + //------------------------------------------------------------------------------ + + /// Constructor + CUB_RUNTIME_FUNCTION __forceinline__ + DispatchRadixSort( + void* d_temp_storage, + size_t &temp_storage_bytes, + DoubleBuffer &d_keys, + DoubleBuffer &d_values, + OffsetT num_items, + int begin_bit, + int end_bit, + bool is_overwrite_okay, + cudaStream_t stream, + bool debug_synchronous, + int ptx_version) + : + d_temp_storage(d_temp_storage), + temp_storage_bytes(temp_storage_bytes), + d_keys(d_keys), + d_values(d_values), + num_items(num_items), + begin_bit(begin_bit), + end_bit(end_bit), + stream(stream), + debug_synchronous(debug_synchronous), + ptx_version(ptx_version), + is_overwrite_okay(is_overwrite_okay) + {} + + + //------------------------------------------------------------------------------ + // Small-problem (single tile) invocation + //------------------------------------------------------------------------------ + + /// Invoke a single block to sort in-core + template < + typename ActivePolicyT, ///< Umbrella policy active for the target device + typename SingleTileKernelT> ///< Function type of cub::DeviceRadixSortSingleTileKernel + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InvokeSingleTile( + SingleTileKernelT single_tile_kernel) ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortSingleTileKernel + { +#ifndef CUB_RUNTIME_ENABLED + (void)single_tile_kernel; + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported ); +#else + cudaError error = cudaSuccess; + do + { + // Return if the caller is simply requesting the size of the storage allocation + if (d_temp_storage == NULL) + { + temp_storage_bytes = 1; + break; + } + + // Return if empty problem + if (num_items == 0) + break; + + // Log single_tile_kernel configuration + if (debug_synchronous) + _CubLog("Invoking single_tile_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n", + 1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, (long long) stream, + ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD, 1, begin_bit, ActivePolicyT::SingleTilePolicy::RADIX_BITS); + + // Invoke upsweep_kernel with same grid size as downsweep_kernel + single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>( + d_keys.Current(), + d_keys.Alternate(), + d_values.Current(), + d_values.Alternate(), + num_items, + begin_bit, + end_bit); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Update selector + d_keys.selector ^= 1; + d_values.selector ^= 1; + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + //------------------------------------------------------------------------------ + // Normal problem size invocation + //------------------------------------------------------------------------------ + + /** + * Invoke a three-kernel sorting pass at the current bit. 
+ */ + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InvokePass( + const KeyT *d_keys_in, + KeyT *d_keys_out, + const ValueT *d_values_in, + ValueT *d_values_out, + OffsetT *d_spine, + int spine_length, + int ¤t_bit, + PassConfigT &pass_config) + { + cudaError error = cudaSuccess; + do + { + int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit)); + + // Log upsweep_kernel configuration + if (debug_synchronous) + _CubLog("Invoking upsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n", + pass_config.even_share.grid_size, pass_config.upsweep_config.block_threads, (long long) stream, + pass_config.upsweep_config.items_per_thread, pass_config.upsweep_config.sm_occupancy, current_bit, pass_bits); + + // Invoke upsweep_kernel with same grid size as downsweep_kernel + pass_config.upsweep_kernel<<>>( + d_keys_in, + d_spine, + num_items, + current_bit, + pass_bits, + pass_config.even_share); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Log scan_kernel configuration + if (debug_synchronous) _CubLog("Invoking scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread\n", + 1, pass_config.scan_config.block_threads, (long long) stream, pass_config.scan_config.items_per_thread); + + // Invoke scan_kernel + pass_config.scan_kernel<<<1, pass_config.scan_config.block_threads, 0, stream>>>( + d_spine, + spine_length); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Log downsweep_kernel configuration + if (debug_synchronous) _CubLog("Invoking downsweep_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + pass_config.even_share.grid_size, pass_config.downsweep_config.block_threads, (long long) stream, + pass_config.downsweep_config.items_per_thread, pass_config.downsweep_config.sm_occupancy); + + // Invoke downsweep_kernel + pass_config.downsweep_kernel<<>>( + d_keys_in, + d_keys_out, + d_values_in, + d_values_out, + d_spine, + num_items, + current_bit, + pass_bits, + pass_config.even_share); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Update current bit + current_bit += pass_bits; + } + while (0); + + return error; + } + + + + /// Pass configuration structure + template < + typename UpsweepKernelT, + typename ScanKernelT, + typename DownsweepKernelT> + struct PassConfig + { + UpsweepKernelT upsweep_kernel; + KernelConfig upsweep_config; + ScanKernelT scan_kernel; + KernelConfig scan_config; + DownsweepKernelT downsweep_kernel; + KernelConfig downsweep_config; + int radix_bits; + int radix_digits; + int max_downsweep_grid_size; + GridEvenShare even_share; + + /// Initialize pass configuration + template < + typename UpsweepPolicyT, + typename ScanPolicyT, + typename DownsweepPolicyT> + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InitPassConfig( + UpsweepKernelT upsweep_kernel, + ScanKernelT scan_kernel, + DownsweepKernelT downsweep_kernel, + int ptx_version, + int sm_count, + int num_items) + { + cudaError error = cudaSuccess; + do + { + 
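    /*
     * Digit-width planning in a nutshell (used by InvokePasses below): when the
     * number of key bits is not a multiple of the preferred digit width, the
     * leading passes run the narrower alternate configuration so that the total
     * comes out exact. A plain-arithmetic sketch for full 32-bit keys with a
     * 7-bit primary and a 6-bit alternate digit, roughly matching the SM50+
     * policies above (values are illustrative only):
     *
     * \code
     * int begin_bit  = 0,  end_bit        = 32;   // sort all 32 key bits
     * int radix_bits = 7,  alt_radix_bits = 6;    // primary / alternate digit widths
     *
     * int num_bits       = end_bit - begin_bit;                           // 32
     * int num_passes     = (num_bits + radix_bits - 1) / radix_bits;      // 5
     * int max_alt_passes = (num_passes * radix_bits) - num_bits;          // 3
     * int alt_end_bit    = begin_bit + (max_alt_passes * alt_radix_bits); // 18
     *
     * // Passes that start below bit 18 use the 6-bit configuration:
     * // 3 passes x 6 bits + 2 passes x 7 bits = 32 bits in 5 passes total.
     * \endcode
     */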
this->upsweep_kernel = upsweep_kernel; + this->scan_kernel = scan_kernel; + this->downsweep_kernel = downsweep_kernel; + radix_bits = DownsweepPolicyT::RADIX_BITS; + radix_digits = 1 << radix_bits; + + if (CubDebug(error = upsweep_config.Init(upsweep_kernel))) break; + if (CubDebug(error = scan_config.Init(scan_kernel))) break; + if (CubDebug(error = downsweep_config.Init(downsweep_kernel))) break; + + max_downsweep_grid_size = (downsweep_config.sm_occupancy * sm_count) * CUB_SUBSCRIPTION_FACTOR(ptx_version); + + even_share.DispatchInit( + num_items, + max_downsweep_grid_size, + CUB_MAX(downsweep_config.tile_size, upsweep_config.tile_size)); + + } + while (0); + return error; + } + + }; + + + /// Invocation (run multiple digit passes) + template < + typename ActivePolicyT, ///< Umbrella policy active for the target device + typename UpsweepKernelT, ///< Function type of cub::DeviceRadixSortUpsweepKernel + typename ScanKernelT, ///< Function type of cub::SpineScanKernel + typename DownsweepKernelT> ///< Function type of cub::DeviceRadixSortDownsweepKernel + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InvokePasses( + UpsweepKernelT upsweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel + UpsweepKernelT alt_upsweep_kernel, ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceRadixSortUpsweepKernel + ScanKernelT scan_kernel, ///< [in] Kernel function pointer to parameterization of cub::SpineScanKernel + DownsweepKernelT downsweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceRadixSortDownsweepKernel + DownsweepKernelT alt_downsweep_kernel) ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceRadixSortDownsweepKernel + { +#ifndef CUB_RUNTIME_ENABLED + (void)upsweep_kernel; + (void)alt_upsweep_kernel; + (void)scan_kernel; + (void)downsweep_kernel; + (void)alt_downsweep_kernel; + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported ); +#else + + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Init regular and alternate-digit kernel configurations + PassConfig pass_config, alt_pass_config; + if ((error = pass_config.template InitPassConfig< + typename ActivePolicyT::UpsweepPolicy, + typename ActivePolicyT::ScanPolicy, + typename ActivePolicyT::DownsweepPolicy>( + upsweep_kernel, scan_kernel, downsweep_kernel, ptx_version, sm_count, num_items))) break; + + if ((error = alt_pass_config.template InitPassConfig< + typename ActivePolicyT::AltUpsweepPolicy, + typename ActivePolicyT::ScanPolicy, + typename ActivePolicyT::AltDownsweepPolicy>( + alt_upsweep_kernel, scan_kernel, alt_downsweep_kernel, ptx_version, sm_count, num_items))) break; + + // Get maximum spine length + int max_grid_size = CUB_MAX(pass_config.max_downsweep_grid_size, alt_pass_config.max_downsweep_grid_size); + int spine_length = (max_grid_size * pass_config.radix_digits) + pass_config.scan_config.tile_size; + + // Temporary storage allocation requirements + void* allocations[3]; + size_t allocation_sizes[3] = + { + spine_length * sizeof(OffsetT), // bytes needed for privatized block digit histograms + (is_overwrite_okay) ? 
0 : num_items * sizeof(KeyT), // bytes needed for 3rd keys buffer
+ (is_overwrite_okay || (KEYS_ONLY)) ? 0 : num_items * sizeof(ValueT), // bytes needed for 3rd values buffer
+ };
+
+ // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob)
+ if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break;
+
+ // Return if the caller is simply requesting the size of the storage allocation
+ if (d_temp_storage == NULL)
+ return cudaSuccess;
+
+ // Pass planning. Run passes of the alternate digit-size configuration until we have an even multiple of our preferred digit size
+ int num_bits = end_bit - begin_bit;
+ int num_passes = (num_bits + pass_config.radix_bits - 1) / pass_config.radix_bits;
+ bool is_num_passes_odd = num_passes & 1;
+ int max_alt_passes = (num_passes * pass_config.radix_bits) - num_bits;
+ int alt_end_bit = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_pass_config.radix_bits));
+
+ // Alias the temporary storage allocations
+ OffsetT *d_spine = static_cast<OffsetT*>(allocations[0]);
+
+ DoubleBuffer<KeyT> d_keys_remaining_passes(
+ (is_overwrite_okay || is_num_passes_odd) ? d_keys.Alternate() : static_cast<KeyT*>(allocations[1]),
+ (is_overwrite_okay) ? d_keys.Current() : (is_num_passes_odd) ? static_cast<KeyT*>(allocations[1]) : d_keys.Alternate());
+
+ DoubleBuffer<ValueT> d_values_remaining_passes(
+ (is_overwrite_okay || is_num_passes_odd) ? d_values.Alternate() : static_cast<ValueT*>(allocations[2]),
+ (is_overwrite_okay) ? d_values.Current() : (is_num_passes_odd) ? static_cast<ValueT*>(allocations[2]) : d_values.Alternate());
+
+ // Run first pass, consuming from the input's current buffers
+ int current_bit = begin_bit;
+ if (CubDebug(error = InvokePass(
+ d_keys.Current(), d_keys_remaining_passes.Current(),
+ d_values.Current(), d_values_remaining_passes.Current(),
+ d_spine, spine_length, current_bit,
+ (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break;
+
+ // Run remaining passes
+ while (current_bit < end_bit)
+ {
+ if (CubDebug(error = InvokePass(
+ d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
+ d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1],
+ d_spine, spine_length, current_bit,
+ (current_bit < alt_end_bit) ?
alt_pass_config : pass_config))) break;; + + // Invert selectors + d_keys_remaining_passes.selector ^= 1; + d_values_remaining_passes.selector ^= 1; + } + + // Update selector + if (!is_overwrite_okay) { + num_passes = 1; // Sorted data always ends up in the other vector + } + + d_keys.selector = (d_keys.selector + num_passes) & 1; + d_values.selector = (d_values.selector + num_passes) & 1; + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + //------------------------------------------------------------------------------ + // Chained policy invocation + //------------------------------------------------------------------------------ + + /// Invocation + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t Invoke() + { + typedef typename DispatchRadixSort::MaxPolicy MaxPolicyT; + typedef typename ActivePolicyT::SingleTilePolicy SingleTilePolicyT; + + // Force kernel code-generation in all compiler passes + if (num_items <= (SingleTilePolicyT::BLOCK_THREADS * SingleTilePolicyT::ITEMS_PER_THREAD)) + { + // Small, single tile size + return InvokeSingleTile( + DeviceRadixSortSingleTileKernel); + } + else + { + // Regular size + return InvokePasses( + DeviceRadixSortUpsweepKernel< MaxPolicyT, false, IS_DESCENDING, KeyT, OffsetT>, + DeviceRadixSortUpsweepKernel< MaxPolicyT, true, IS_DESCENDING, KeyT, OffsetT>, + RadixSortScanBinsKernel< MaxPolicyT, OffsetT>, + DeviceRadixSortDownsweepKernel< MaxPolicyT, false, IS_DESCENDING, KeyT, ValueT, OffsetT>, + DeviceRadixSortDownsweepKernel< MaxPolicyT, true, IS_DESCENDING, KeyT, ValueT, OffsetT>); + } + } + + + //------------------------------------------------------------------------------ + // Dispatch entrypoints + //------------------------------------------------------------------------------ + + /** + * Internal dispatch routine + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + DoubleBuffer &d_values, ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values + OffsetT num_items, ///< [in] Number of items to sort + int begin_bit, ///< [in] The beginning (least-significant) bit index needed for key comparison + int end_bit, ///< [in] The past-the-end (most-significant) bit index needed for key comparison + bool is_overwrite_okay, ///< [in] Whether is okay to overwrite source buffers + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
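    /*
     * This Dispatch() routine is what the public cub::DeviceRadixSort entry
     * points funnel into. The pointer-based overloads wrap the user arrays in a
     * DoubleBuffer with is_overwrite_okay == false, which is why InvokePasses
     * above reserves a third keys (and values) buffer in temporary storage. A
     * minimal sketch (illustrative, not part of this patch); d_keys_in and
     * d_keys_out are assumed device arrays of num_items keys:
     *
     * \code
     * void  *d_temp_storage     = NULL;
     * size_t temp_storage_bytes = 0;
     * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes,
     *     d_keys_in, d_keys_out, num_items);            // size query only
     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
     * cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes,
     *     d_keys_in, d_keys_out, num_items);            // run the sort
     * \endcode
     */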
+ { + typedef typename DispatchRadixSort::MaxPolicy MaxPolicyT; + + cudaError_t error; + do { + // Get PTX version + int ptx_version; + if (CubDebug(error = PtxVersion(ptx_version))) break; + + // Create dispatch functor + DispatchRadixSort dispatch( + d_temp_storage, temp_storage_bytes, + d_keys, d_values, + num_items, begin_bit, end_bit, is_overwrite_okay, + stream, debug_synchronous, ptx_version); + + // Dispatch to chained policy + if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break; + + } while (0); + + return error; + } +}; + + + + +/****************************************************************************** + * Segmented dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for segmented device-wide radix sort + */ +template < + bool IS_DESCENDING, ///< Whether or not the sorted-order is high-to-low + typename KeyT, ///< Key type + typename ValueT, ///< Value type + typename OffsetIteratorT, ///< Random-access input iterator type for reading segment offsets \iterator + typename OffsetT> ///< Signed integer type for global offsets +struct DispatchSegmentedRadixSort : + DeviceRadixSortPolicy +{ + //------------------------------------------------------------------------------ + // Constants + //------------------------------------------------------------------------------ + + enum + { + // Whether this is a keys-only (or key-value) sort + KEYS_ONLY = (Equals::VALUE), + }; + + + //------------------------------------------------------------------------------ + // Parameter members + //------------------------------------------------------------------------------ + + void *d_temp_storage; ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys; ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + DoubleBuffer &d_values; ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values + OffsetT num_items; ///< [in] Number of items to sort + OffsetT num_segments; ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets; ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets; ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int begin_bit; ///< [in] The beginning (least-significant) bit index needed for key comparison + int end_bit; ///< [in] The past-the-end (most-significant) bit index needed for key comparison + cudaStream_t stream; ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous; ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
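    /*
     * Segment boundaries are given as begin/end offset sequences (see the
     * d_begin_offsets / d_end_offsets members above); a single offsets array of
     * length num_segments + 1 can serve as both. A minimal usage sketch of the
     * public entry point (illustrative, not part of this patch); d_keys_in,
     * d_keys_out and d_offsets are assumed device allocations:
     *
     * \code
     * // Three segments covering items [0,100), [100,250), [250,400):
     * // d_offsets = {0, 100, 250, 400}, num_items = 400, num_segments = 3
     * void  *d_temp_storage     = NULL;
     * size_t temp_storage_bytes = 0;
     * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes,
     *     d_keys_in, d_keys_out, 400, 3, d_offsets, d_offsets + 1);
     * cudaMalloc(&d_temp_storage, temp_storage_bytes);
     * cub::DeviceSegmentedRadixSort::SortKeys(d_temp_storage, temp_storage_bytes,
     *     d_keys_in, d_keys_out, 400, 3, d_offsets, d_offsets + 1);
     * \endcode
     *
     * Each segment is sorted independently, one thread block per segment per
     * digit pass, as in DeviceSegmentedRadixSortKernel above.
     */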
+ int ptx_version; ///< [in] PTX version + bool is_overwrite_okay; ///< [in] Whether is okay to overwrite source buffers + + + //------------------------------------------------------------------------------ + // Constructors + //------------------------------------------------------------------------------ + + /// Constructor + CUB_RUNTIME_FUNCTION __forceinline__ + DispatchSegmentedRadixSort( + void* d_temp_storage, + size_t &temp_storage_bytes, + DoubleBuffer &d_keys, + DoubleBuffer &d_values, + OffsetT num_items, + OffsetT num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + int begin_bit, + int end_bit, + bool is_overwrite_okay, + cudaStream_t stream, + bool debug_synchronous, + int ptx_version) + : + d_temp_storage(d_temp_storage), + temp_storage_bytes(temp_storage_bytes), + d_keys(d_keys), + d_values(d_values), + num_items(num_items), + num_segments(num_segments), + d_begin_offsets(d_begin_offsets), + d_end_offsets(d_end_offsets), + begin_bit(begin_bit), + end_bit(end_bit), + is_overwrite_okay(is_overwrite_okay), + stream(stream), + debug_synchronous(debug_synchronous), + ptx_version(ptx_version) + {} + + + //------------------------------------------------------------------------------ + // Multi-segment invocation + //------------------------------------------------------------------------------ + + /// Invoke a three-kernel sorting pass at the current bit. + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InvokePass( + const KeyT *d_keys_in, + KeyT *d_keys_out, + const ValueT *d_values_in, + ValueT *d_values_out, + int ¤t_bit, + PassConfigT &pass_config) + { + cudaError error = cudaSuccess; + do + { + int pass_bits = CUB_MIN(pass_config.radix_bits, (end_bit - current_bit)); + + // Log kernel configuration + if (debug_synchronous) + _CubLog("Invoking segmented_kernels<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy, current bit %d, bit_grain %d\n", + num_segments, pass_config.segmented_config.block_threads, (long long) stream, + pass_config.segmented_config.items_per_thread, pass_config.segmented_config.sm_occupancy, current_bit, pass_bits); + + pass_config.segmented_kernel<<>>( + d_keys_in, d_keys_out, + d_values_in, d_values_out, + d_begin_offsets, d_end_offsets, num_segments, + current_bit, pass_bits); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Update current bit + current_bit += pass_bits; + } + while (0); + + return error; + } + + + /// PassConfig data structure + template + struct PassConfig + { + SegmentedKernelT segmented_kernel; + KernelConfig segmented_config; + int radix_bits; + int radix_digits; + + /// Initialize pass configuration + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InitPassConfig(SegmentedKernelT segmented_kernel) + { + this->segmented_kernel = segmented_kernel; + this->radix_bits = SegmentedPolicyT::RADIX_BITS; + this->radix_digits = 1 << radix_bits; + + return CubDebug(segmented_config.Init(segmented_kernel)); + } + }; + + + /// Invocation (run multiple digit passes) + template < + typename ActivePolicyT, ///< Umbrella policy active for the target device + typename SegmentedKernelT> ///< Function type of cub::DeviceSegmentedRadixSortKernel + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InvokePasses( + SegmentedKernelT segmented_kernel, ///< [in] Kernel function pointer to parameterization 
of cub::DeviceSegmentedRadixSortKernel + SegmentedKernelT alt_segmented_kernel) ///< [in] Alternate kernel function pointer to parameterization of cub::DeviceSegmentedRadixSortKernel + { +#ifndef CUB_RUNTIME_ENABLED + (void)segmented_kernel; + (void)alt_segmented_kernel; + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported ); +#else + + cudaError error = cudaSuccess; + do + { + // Init regular and alternate kernel configurations + PassConfig pass_config, alt_pass_config; + if ((error = pass_config.template InitPassConfig(segmented_kernel))) break; + if ((error = alt_pass_config.template InitPassConfig(alt_segmented_kernel))) break; + + // Temporary storage allocation requirements + void* allocations[2]; + size_t allocation_sizes[2] = + { + (is_overwrite_okay) ? 0 : num_items * sizeof(KeyT), // bytes needed for 3rd keys buffer + (is_overwrite_okay || (KEYS_ONLY)) ? 0 : num_items * sizeof(ValueT), // bytes needed for 3rd values buffer + }; + + // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + + // Return if the caller is simply requesting the size of the storage allocation + if (d_temp_storage == NULL) + { + if (temp_storage_bytes == 0) + temp_storage_bytes = 1; + return cudaSuccess; + } + + // Pass planning. Run passes of the alternate digit-size configuration until we have an even multiple of our preferred digit size + int radix_bits = ActivePolicyT::SegmentedPolicy::RADIX_BITS; + int alt_radix_bits = ActivePolicyT::AltSegmentedPolicy::RADIX_BITS; + int num_bits = end_bit - begin_bit; + int num_passes = (num_bits + radix_bits - 1) / radix_bits; + bool is_num_passes_odd = num_passes & 1; + int max_alt_passes = (num_passes * radix_bits) - num_bits; + int alt_end_bit = CUB_MIN(end_bit, begin_bit + (max_alt_passes * alt_radix_bits)); + + DoubleBuffer d_keys_remaining_passes( + (is_overwrite_okay || is_num_passes_odd) ? d_keys.Alternate() : static_cast(allocations[0]), + (is_overwrite_okay) ? d_keys.Current() : (is_num_passes_odd) ? static_cast(allocations[0]) : d_keys.Alternate()); + + DoubleBuffer d_values_remaining_passes( + (is_overwrite_okay || is_num_passes_odd) ? d_values.Alternate() : static_cast(allocations[1]), + (is_overwrite_okay) ? d_values.Current() : (is_num_passes_odd) ? static_cast(allocations[1]) : d_values.Alternate()); + + // Run first pass, consuming from the input's current buffers + int current_bit = begin_bit; + + if (CubDebug(error = InvokePass( + d_keys.Current(), d_keys_remaining_passes.Current(), + d_values.Current(), d_values_remaining_passes.Current(), + current_bit, + (current_bit < alt_end_bit) ? alt_pass_config : pass_config))) break; + + // Run remaining passes + while (current_bit < end_bit) + { + if (CubDebug(error = InvokePass( + d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_keys_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1], + d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector], d_values_remaining_passes.d_buffers[d_keys_remaining_passes.selector ^ 1], + current_bit, + (current_bit < alt_end_bit) ? 
alt_pass_config : pass_config))) break; + + // Invert selectors and update current bit + d_keys_remaining_passes.selector ^= 1; + d_values_remaining_passes.selector ^= 1; + } + + // Update selector + if (!is_overwrite_okay) { + num_passes = 1; // Sorted data always ends up in the other vector + } + + d_keys.selector = (d_keys.selector + num_passes) & 1; + d_values.selector = (d_values.selector + num_passes) & 1; + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + //------------------------------------------------------------------------------ + // Chained policy invocation + //------------------------------------------------------------------------------ + + /// Invocation + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t Invoke() + { + typedef typename DispatchSegmentedRadixSort::MaxPolicy MaxPolicyT; + + // Force kernel code-generation in all compiler passes + return InvokePasses( + DeviceSegmentedRadixSortKernel, + DeviceSegmentedRadixSortKernel); + } + + + //------------------------------------------------------------------------------ + // Dispatch entrypoints + //------------------------------------------------------------------------------ + + + /// Internal dispatch routine + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + DoubleBuffer &d_keys, ///< [in,out] Double-buffer whose current buffer contains the unsorted input keys and, upon return, is updated to point to the sorted output keys + DoubleBuffer &d_values, ///< [in,out] Double-buffer whose current buffer contains the unsorted input values and, upon return, is updated to point to the sorted output values + int num_items, ///< [in] Number of items to sort + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int begin_bit, ///< [in] The beginning (least-significant) bit index needed for key comparison + int end_bit, ///< [in] The past-the-end (most-significant) bit index needed for key comparison + bool is_overwrite_okay, ///< [in] Whether is okay to overwrite source buffers + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
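The segmented dispatcher above is driven the same way, with per-segment [begin, end) offsets describing independent sub-ranges of the key/value arrays. A hedged sketch through the public cub::DeviceSegmentedRadixSort::SortKeys wrapper (again assuming the wrapper header accompanies this vendored copy; the offset arrays and key type shown are illustrative):

    #include <cub/cub.cuh>
    #include <cuda_runtime.h>

    // Sort keys independently within each segment; segment i covers
    // [d_begin_offsets[i], d_end_offsets[i]), and empty segments are allowed.
    cudaError_t segmented_sort_example(const float *d_keys_in, float *d_keys_out,
                                       int num_items, int num_segments,
                                       const int *d_begin_offsets, const int *d_end_offsets)
    {
        void  *d_temp_storage     = NULL;
        size_t temp_storage_bytes = 0;

        // Size query
        cudaError_t err = cub::DeviceSegmentedRadixSort::SortKeys(
            d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
            num_items, num_segments, d_begin_offsets, d_end_offsets);
        if (err != cudaSuccess) return err;

        // Allocate and sort
        err = cudaMalloc(&d_temp_storage, temp_storage_bytes);
        if (err != cudaSuccess) return err;
        err = cub::DeviceSegmentedRadixSort::SortKeys(
            d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out,
            num_items, num_segments, d_begin_offsets, d_end_offsets);

        cudaFree(d_temp_storage);
        return err;
    }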
+ { + typedef typename DispatchSegmentedRadixSort::MaxPolicy MaxPolicyT; + + cudaError_t error; + do { + // Get PTX version + int ptx_version; + if (CubDebug(error = PtxVersion(ptx_version))) break; + + // Create dispatch functor + DispatchSegmentedRadixSort dispatch( + d_temp_storage, temp_storage_bytes, + d_keys, d_values, + num_items, num_segments, d_begin_offsets, d_end_offsets, + begin_bit, end_bit, is_overwrite_okay, + stream, debug_synchronous, ptx_version); + + // Dispatch to chained policy + if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break; + + } while (0); + + return error; + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/GraphBLAS/CUDA/local_cub/device/dispatch/dispatch_reduce.cuh b/GraphBLAS/CUDA/local_cub/device/dispatch/dispatch_reduce.cuh new file mode 100644 index 0000000000..e9d1b7ac17 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/device/dispatch/dispatch_reduce.cuh @@ -0,0 +1,882 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceReduce provides device-wide, parallel operations for computing a reduction across a sequence of data items residing within device-accessible memory. 
+ */ + +#pragma once + +#include +#include + +#include "../../agent/agent_reduce.cuh" +#include "../../iterator/arg_index_input_iterator.cuh" +#include "../../thread/thread_operators.cuh" +#include "../../grid/grid_even_share.cuh" +#include "../../iterator/arg_index_input_iterator.cuh" +#include "../../util_debug.cuh" +#include "../../util_device.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +/** + * Reduce region kernel entry point (multi-block). Computes privatized reductions, one per thread block. + */ +template < + typename ChainedPolicyT, ///< Chained tuning policy + typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator + typename OutputIteratorT, ///< Output iterator type for recording the reduced aggregate \iterator + typename OffsetT, ///< Signed integer type for global offsets + typename ReductionOpT> ///< Binary reduction functor type having member T operator()(const T &a, const T &b) +__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) +__global__ void DeviceReduceKernel( + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + OffsetT num_items, ///< [in] Total number of input data items + GridEvenShare even_share, ///< [in] Even-share descriptor for mapping an equal number of tiles onto each thread block + ReductionOpT reduction_op) ///< [in] Binary reduction functor +{ + // The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type + + // Thread block type for reducing input tiles + typedef AgentReduce< + typename ChainedPolicyT::ActivePolicy::ReducePolicy, + InputIteratorT, + OutputIteratorT, + OffsetT, + ReductionOpT> + AgentReduceT; + + // Shared memory storage + __shared__ typename AgentReduceT::TempStorage temp_storage; + + // Consume input tiles + OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeTiles(even_share); + + // Output result + if (threadIdx.x == 0) + d_out[blockIdx.x] = block_aggregate; +} + + +/** + * Reduce a single tile kernel entry point (single-block). Can be used to aggregate privatized thread block reductions from a previous multi-block reduction pass. 
+ */ +template < + typename ChainedPolicyT, ///< Chained tuning policy + typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator + typename OutputIteratorT, ///< Output iterator type for recording the reduced aggregate \iterator + typename OffsetT, ///< Signed integer type for global offsets + typename ReductionOpT, ///< Binary reduction functor type having member T operator()(const T &a, const T &b) + typename OuputT> ///< Data element type that is convertible to the \p value type of \p OutputIteratorT +__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1) +__global__ void DeviceReduceSingleTileKernel( + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + OffsetT num_items, ///< [in] Total number of input data items + ReductionOpT reduction_op, ///< [in] Binary reduction functor + OuputT init) ///< [in] The initial value of the reduction +{ + // Thread block type for reducing input tiles + typedef AgentReduce< + typename ChainedPolicyT::ActivePolicy::SingleTilePolicy, + InputIteratorT, + OutputIteratorT, + OffsetT, + ReductionOpT> + AgentReduceT; + + // Shared memory storage + __shared__ typename AgentReduceT::TempStorage temp_storage; + + // Check if empty problem + if (num_items == 0) + { + if (threadIdx.x == 0) + *d_out = init; + return; + } + + // Consume input tiles + OuputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange( + OffsetT(0), + num_items); + + // Output result + if (threadIdx.x == 0) + *d_out = reduction_op(init, block_aggregate); +} + + +/// Normalize input iterator to segment offset +template +__device__ __forceinline__ +void NormalizeReductionOutput( + T &/*val*/, + OffsetT /*base_offset*/, + IteratorT /*itr*/) +{} + + +/// Normalize input iterator to segment offset (specialized for arg-index) +template +__device__ __forceinline__ +void NormalizeReductionOutput( + KeyValuePairT &val, + OffsetT base_offset, + ArgIndexInputIterator /*itr*/) +{ + val.key -= base_offset; +} + + +/** + * Segmented reduction (one block per segment) + */ +template < + typename ChainedPolicyT, ///< Chained tuning policy + typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator + typename OutputIteratorT, ///< Output iterator type for recording the reduced aggregate \iterator + typename OffsetIteratorT, ///< Random-access input iterator type for reading segment offsets \iterator + typename OffsetT, ///< Signed integer type for global offsets + typename ReductionOpT, ///< Binary reduction functor type having member T operator()(const T &a, const T &b) + typename OutputT> ///< Data element type that is convertible to the \p value type of \p OutputIteratorT +__launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) +__global__ void DeviceSegmentedReduceKernel( + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. 
If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + int /*num_segments*/, ///< [in] The number of segments that comprise the sorting data + ReductionOpT reduction_op, ///< [in] Binary reduction functor + OutputT init) ///< [in] The initial value of the reduction +{ + // Thread block type for reducing input tiles + typedef AgentReduce< + typename ChainedPolicyT::ActivePolicy::ReducePolicy, + InputIteratorT, + OutputIteratorT, + OffsetT, + ReductionOpT> + AgentReduceT; + + // Shared memory storage + __shared__ typename AgentReduceT::TempStorage temp_storage; + + OffsetT segment_begin = d_begin_offsets[blockIdx.x]; + OffsetT segment_end = d_end_offsets[blockIdx.x]; + + // Check if empty problem + if (segment_begin == segment_end) + { + if (threadIdx.x == 0) + d_out[blockIdx.x] = init; + return; + } + + // Consume input tiles + OutputT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange( + segment_begin, + segment_end); + + // Normalize as needed + NormalizeReductionOutput(block_aggregate, segment_begin, d_in); + + if (threadIdx.x == 0) + d_out[blockIdx.x] = reduction_op(init, block_aggregate);; +} + + + + +/****************************************************************************** + * Policy + ******************************************************************************/ + +template < + typename OuputT, ///< Data type + typename OffsetT, ///< Signed integer type for global offsets + typename ReductionOpT> ///< Binary reduction functor type having member T operator()(const T &a, const T &b) +struct DeviceReducePolicy +{ + //------------------------------------------------------------------------------ + // Architecture-specific tuning policies + //------------------------------------------------------------------------------ + + /// SM13 + struct Policy130 : ChainedPolicy<130, Policy130, Policy130> + { + // ReducePolicy + typedef AgentReducePolicy< + CUB_SCALED_GRANULARITIES(128, 8, OuputT), ///< Threads per block, items per thread + 2, ///< Number of items per vectorized load + BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use + LOAD_DEFAULT> ///< Cache load modifier + ReducePolicy; + + // SingleTilePolicy + typedef ReducePolicy SingleTilePolicy; + + // SegmentedReducePolicy + typedef ReducePolicy SegmentedReducePolicy; + }; + + + /// SM20 + struct Policy200 : ChainedPolicy<200, Policy200, Policy130> + { + // ReducePolicy (GTX 580: 178.9 GB/s @ 48M 4B items, 158.1 GB/s @ 192M 1B items) + typedef AgentReducePolicy< + CUB_SCALED_GRANULARITIES(128, 8, OuputT), ///< Threads per block, items per thread + 4, ///< Number of items per vectorized load + BLOCK_REDUCE_RAKING, ///< Cooperative block-wide reduction algorithm to use + LOAD_DEFAULT> ///< Cache load modifier + ReducePolicy; + + // SingleTilePolicy + typedef ReducePolicy SingleTilePolicy; + + // SegmentedReducePolicy + typedef ReducePolicy SegmentedReducePolicy; + }; + + + /// SM30 + struct Policy300 : ChainedPolicy<300, Policy300, Policy200> + { + // ReducePolicy (GTX670: 154.0 @ 48M 4B items) + typedef AgentReducePolicy< + CUB_SCALED_GRANULARITIES(256, 20, OuputT), ///< Threads per block, items per thread + 2, ///< Number of items per vectorized load + BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use + LOAD_DEFAULT> ///< Cache load modifier + ReducePolicy; + + // SingleTilePolicy + typedef ReducePolicy SingleTilePolicy; + + // SegmentedReducePolicy + typedef ReducePolicy SegmentedReducePolicy; + }; + + + /// SM35 + struct 
Policy350 : ChainedPolicy<350, Policy350, Policy300> + { + // ReducePolicy (GTX Titan: 255.1 GB/s @ 48M 4B items; 228.7 GB/s @ 192M 1B items) + typedef AgentReducePolicy< + CUB_SCALED_GRANULARITIES(256, 20, OuputT), ///< Threads per block, items per thread + 4, ///< Number of items per vectorized load + BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use + LOAD_LDG> ///< Cache load modifier + ReducePolicy; + + // SingleTilePolicy + typedef ReducePolicy SingleTilePolicy; + + // SegmentedReducePolicy + typedef ReducePolicy SegmentedReducePolicy; + }; + + /// SM60 + struct Policy600 : ChainedPolicy<600, Policy600, Policy350> + { + // ReducePolicy (P100: 591 GB/s @ 64M 4B items; 583 GB/s @ 256M 1B items) + typedef AgentReducePolicy< + CUB_SCALED_GRANULARITIES(256, 16, OuputT), ///< Threads per block, items per thread + 4, ///< Number of items per vectorized load + BLOCK_REDUCE_WARP_REDUCTIONS, ///< Cooperative block-wide reduction algorithm to use + LOAD_LDG> ///< Cache load modifier + ReducePolicy; + + // SingleTilePolicy + typedef ReducePolicy SingleTilePolicy; + + // SegmentedReducePolicy + typedef ReducePolicy SegmentedReducePolicy; + }; + + + /// MaxPolicy + typedef Policy600 MaxPolicy; + +}; + + + +/****************************************************************************** + * Single-problem dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for device-wide reduction + */ +template < + typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator + typename OutputIteratorT, ///< Output iterator type for recording the reduced aggregate \iterator + typename OffsetT, ///< Signed integer type for global offsets + typename ReductionOpT> ///< Binary reduction functor type having member T operator()(const T &a, const T &b) +struct DispatchReduce : + DeviceReducePolicy< + typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type, // ... else the output iterator's value type + OffsetT, + ReductionOpT> +{ + //------------------------------------------------------------------------------ + // Constants + //------------------------------------------------------------------------------ + + // Data type of output iterator + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type + + + //------------------------------------------------------------------------------ + // Problem state + //------------------------------------------------------------------------------ + + void *d_temp_storage; ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in; ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out; ///< [out] Pointer to the output aggregate + OffsetT num_items; ///< [in] Total number of input items (i.e., length of \p d_in) + ReductionOpT reduction_op; ///< [in] Binary reduction functor + OutputT init; ///< [in] The initial value of the reduction + cudaStream_t stream; ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous; ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + int ptx_version; ///< [in] PTX version + + //------------------------------------------------------------------------------ + // Constructor + //------------------------------------------------------------------------------ + + /// Constructor + CUB_RUNTIME_FUNCTION __forceinline__ + DispatchReduce( + void* d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + OffsetT num_items, + ReductionOpT reduction_op, + OutputT init, + cudaStream_t stream, + bool debug_synchronous, + int ptx_version) + : + d_temp_storage(d_temp_storage), + temp_storage_bytes(temp_storage_bytes), + d_in(d_in), + d_out(d_out), + num_items(num_items), + reduction_op(reduction_op), + init(init), + stream(stream), + debug_synchronous(debug_synchronous), + ptx_version(ptx_version) + {} + + + //------------------------------------------------------------------------------ + // Small-problem (single tile) invocation + //------------------------------------------------------------------------------ + + /// Invoke a single block block to reduce in-core + template < + typename ActivePolicyT, ///< Umbrella policy active for the target device + typename SingleTileKernelT> ///< Function type of cub::DeviceReduceSingleTileKernel + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InvokeSingleTile( + SingleTileKernelT single_tile_kernel) ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceSingleTileKernel + { +#ifndef CUB_RUNTIME_ENABLED + (void)single_tile_kernel; + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported ); +#else + cudaError error = cudaSuccess; + do + { + // Return if the caller is simply requesting the size of the storage allocation + if (d_temp_storage == NULL) + { + temp_storage_bytes = 1; + break; + } + + // Log single_reduce_sweep_kernel configuration + if (debug_synchronous) _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), %d items per thread\n", + ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, + (long long) stream, + ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD); + + // Invoke single_reduce_sweep_kernel + single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>( + d_in, + d_out, + num_items, + reduction_op, + init); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + //------------------------------------------------------------------------------ + // Normal problem size invocation (two-pass) + 
//------------------------------------------------------------------------------ + + /// Invoke two-passes to reduce + template < + typename ActivePolicyT, ///< Umbrella policy active for the target device + typename ReduceKernelT, ///< Function type of cub::DeviceReduceKernel + typename SingleTileKernelT> ///< Function type of cub::DeviceReduceSingleTileKernel + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InvokePasses( + ReduceKernelT reduce_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceKernel + SingleTileKernelT single_tile_kernel) ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceSingleTileKernel + { +#ifndef CUB_RUNTIME_ENABLED + (void) reduce_kernel; + (void) single_tile_kernel; + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported ); +#else + + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Init regular kernel configuration + KernelConfig reduce_config; + if (CubDebug(error = reduce_config.Init(reduce_kernel))) break; + int reduce_device_occupancy = reduce_config.sm_occupancy * sm_count; + + // Even-share work distribution + int max_blocks = reduce_device_occupancy * CUB_SUBSCRIPTION_FACTOR(ptx_version); + GridEvenShare even_share; + even_share.DispatchInit(num_items, max_blocks, reduce_config.tile_size); + + // Temporary storage allocation requirements + void* allocations[1]; + size_t allocation_sizes[1] = + { + max_blocks * sizeof(OutputT) // bytes needed for privatized block reductions + }; + + // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + return cudaSuccess; + } + + // Alias the allocation for the privatized per-block reductions + OutputT *d_block_reductions = (OutputT*) allocations[0]; + + // Get grid size for device_reduce_sweep_kernel + int reduce_grid_size = even_share.grid_size; + + // Log device_reduce_sweep_kernel configuration + if (debug_synchronous) _CubLog("Invoking DeviceReduceKernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + reduce_grid_size, + ActivePolicyT::ReducePolicy::BLOCK_THREADS, + (long long) stream, + ActivePolicyT::ReducePolicy::ITEMS_PER_THREAD, + reduce_config.sm_occupancy); + + // Invoke DeviceReduceKernel + reduce_kernel<<>>( + d_in, + d_block_reductions, + num_items, + even_share, + reduction_op); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Log single_reduce_sweep_kernel configuration + if (debug_synchronous) _CubLog("Invoking DeviceReduceSingleTileKernel<<<1, %d, 0, %lld>>>(), %d items per thread\n", + ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, + (long long) stream, + ActivePolicyT::SingleTilePolicy::ITEMS_PER_THREAD); + + // Invoke DeviceReduceSingleTileKernel + single_tile_kernel<<<1, ActivePolicyT::SingleTilePolicy::BLOCK_THREADS, 0, stream>>>( + d_block_reductions, + d_out, + 
reduce_grid_size, + reduction_op, + init); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + + } + + + //------------------------------------------------------------------------------ + // Chained policy invocation + //------------------------------------------------------------------------------ + + /// Invocation + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t Invoke() + { + typedef typename ActivePolicyT::SingleTilePolicy SingleTilePolicyT; + typedef typename DispatchReduce::MaxPolicy MaxPolicyT; + + // Force kernel code-generation in all compiler passes + if (num_items <= (SingleTilePolicyT::BLOCK_THREADS * SingleTilePolicyT::ITEMS_PER_THREAD)) + { + // Small, single tile size + return InvokeSingleTile( + DeviceReduceSingleTileKernel); + } + else + { + // Regular size + return InvokePasses( + DeviceReduceKernel, + DeviceReduceSingleTileKernel); + } + } + + + //------------------------------------------------------------------------------ + // Dispatch entrypoints + //------------------------------------------------------------------------------ + + /** + * Internal dispatch routine for computing a device-wide reduction + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + ReductionOpT reduction_op, ///< [in] Binary reduction functor + OutputT init, ///< [in] The initial value of the reduction + cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
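The reduction dispatcher documented above follows the same two-phase protocol; inputs larger than a single tile take the two-pass path shown here (a privatized per-block reduction followed by a single-tile pass over the block partials). A minimal sketch through the public cub::DeviceReduce::Sum wrapper (wrapper header assumed to be available; buffer names illustrative):

    #include <cub/cub.cuh>
    #include <cuda_runtime.h>

    // Sum 'n' doubles on the device; *d_out receives the aggregate.
    cudaError_t device_sum_example(const double *d_in, double *d_out, int n)
    {
        void  *d_temp_storage     = NULL;
        size_t temp_storage_bytes = 0;

        // Size query: no work is done while d_temp_storage is NULL.
        cudaError_t err = cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes,
                                                 d_in, d_out, n);
        if (err != cudaSuccess) return err;

        err = cudaMalloc(&d_temp_storage, temp_storage_bytes);
        if (err != cudaSuccess) return err;

        // Reduction proper: small inputs take the single-tile kernel,
        // larger ones the two-pass (DeviceReduceKernel + single-tile) path above.
        err = cub::DeviceReduce::Sum(d_temp_storage, temp_storage_bytes, d_in, d_out, n);

        cudaFree(d_temp_storage);
        return err;
    }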
+ { + typedef typename DispatchReduce::MaxPolicy MaxPolicyT; + + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + if (CubDebug(error = PtxVersion(ptx_version))) break; + + // Create dispatch functor + DispatchReduce dispatch( + d_temp_storage, temp_storage_bytes, + d_in, d_out, num_items, reduction_op, init, + stream, debug_synchronous, ptx_version); + + // Dispatch to chained policy + if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break; + } + while (0); + + return error; + } +}; + + + +/****************************************************************************** + * Segmented dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for device-wide reduction + */ +template < + typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator + typename OutputIteratorT, ///< Output iterator type for recording the reduced aggregate \iterator + typename OffsetIteratorT, ///< Random-access input iterator type for reading segment offsets \iterator + typename OffsetT, ///< Signed integer type for global offsets + typename ReductionOpT> ///< Binary reduction functor type having member T operator()(const T &a, const T &b) +struct DispatchSegmentedReduce : + DeviceReducePolicy< + typename std::iterator_traits::value_type, + OffsetT, + ReductionOpT> +{ + //------------------------------------------------------------------------------ + // Constants + //------------------------------------------------------------------------------ + + /// The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... else the output iterator's value type + + + //------------------------------------------------------------------------------ + // Problem state + //------------------------------------------------------------------------------ + + void *d_temp_storage; ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes; ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in; ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out; ///< [out] Pointer to the output aggregate + OffsetT num_segments; ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets; ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets; ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + ReductionOpT reduction_op; ///< [in] Binary reduction functor + OutputT init; ///< [in] The initial value of the reduction + cudaStream_t stream; ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous; ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. 
Also causes launch configurations to be printed to the console. Default is \p false. + int ptx_version; ///< [in] PTX version + + //------------------------------------------------------------------------------ + // Constructor + //------------------------------------------------------------------------------ + + /// Constructor + CUB_RUNTIME_FUNCTION __forceinline__ + DispatchSegmentedReduce( + void* d_temp_storage, + size_t &temp_storage_bytes, + InputIteratorT d_in, + OutputIteratorT d_out, + OffsetT num_segments, + OffsetIteratorT d_begin_offsets, + OffsetIteratorT d_end_offsets, + ReductionOpT reduction_op, + OutputT init, + cudaStream_t stream, + bool debug_synchronous, + int ptx_version) + : + d_temp_storage(d_temp_storage), + temp_storage_bytes(temp_storage_bytes), + d_in(d_in), + d_out(d_out), + num_segments(num_segments), + d_begin_offsets(d_begin_offsets), + d_end_offsets(d_end_offsets), + reduction_op(reduction_op), + init(init), + stream(stream), + debug_synchronous(debug_synchronous), + ptx_version(ptx_version) + {} + + + + //------------------------------------------------------------------------------ + // Chained policy invocation + //------------------------------------------------------------------------------ + + /// Invocation + template < + typename ActivePolicyT, ///< Umbrella policy active for the target device + typename DeviceSegmentedReduceKernelT> ///< Function type of cub::DeviceSegmentedReduceKernel + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t InvokePasses( + DeviceSegmentedReduceKernelT segmented_reduce_kernel) ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentedReduceKernel + { +#ifndef CUB_RUNTIME_ENABLED + (void)segmented_reduce_kernel; + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported ); +#else + cudaError error = cudaSuccess; + do + { + // Return if the caller is simply requesting the size of the storage allocation + if (d_temp_storage == NULL) + { + temp_storage_bytes = 1; + return cudaSuccess; + } + + // Init kernel configuration + KernelConfig segmented_reduce_config; + if (CubDebug(error = segmented_reduce_config.Init(segmented_reduce_kernel))) break; + + // Log device_reduce_sweep_kernel configuration + if (debug_synchronous) _CubLog("Invoking SegmentedDeviceReduceKernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + num_segments, + ActivePolicyT::SegmentedReducePolicy::BLOCK_THREADS, + (long long) stream, + ActivePolicyT::SegmentedReducePolicy::ITEMS_PER_THREAD, + segmented_reduce_config.sm_occupancy); + + // Invoke DeviceReduceKernel + segmented_reduce_kernel<<>>( + d_in, + d_out, + d_begin_offsets, + d_end_offsets, + num_segments, + reduction_op, + init); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + + } + + + /// Invocation + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t Invoke() + { + typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT; + + // Force kernel code-generation in all compiler passes + return InvokePasses( + DeviceSegmentedReduceKernel); + } + + + //------------------------------------------------------------------------------ + // Dispatch entrypoints + //------------------------------------------------------------------------------ + + /** + * Internal dispatch routine 
for computing a device-wide reduction + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t &temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output aggregate + int num_segments, ///< [in] The number of segments that comprise the sorting data + OffsetIteratorT d_begin_offsets, ///< [in] Pointer to the sequence of beginning offsets of length \p num_segments, such that d_begin_offsets[i] is the first element of the ith data segment in d_keys_* and d_values_* + OffsetIteratorT d_end_offsets, ///< [in] Pointer to the sequence of ending offsets of length \p num_segments, such that d_end_offsets[i]-1 is the last element of the ith data segment in d_keys_* and d_values_*. If d_end_offsets[i]-1 <= d_begin_offsets[i], the ith is considered empty. + ReductionOpT reduction_op, ///< [in] Binary reduction functor + OutputT init, ///< [in] The initial value of the reduction + cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + typedef typename DispatchSegmentedReduce::MaxPolicy MaxPolicyT; + + if (num_segments <= 0) + return cudaSuccess; + + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + if (CubDebug(error = PtxVersion(ptx_version))) break; + + // Create dispatch functor + DispatchSegmentedReduce dispatch( + d_temp_storage, temp_storage_bytes, + d_in, d_out, + num_segments, d_begin_offsets, d_end_offsets, + reduction_op, init, + stream, debug_synchronous, ptx_version); + + // Dispatch to chained policy + if (CubDebug(error = MaxPolicyT::Invoke(ptx_version, dispatch))) break; + } + while (0); + + return error; + } +}; + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/GraphBLAS/CUDA/local_cub/device/dispatch/dispatch_reduce_by_key.cuh b/GraphBLAS/CUDA/local_cub/device/dispatch/dispatch_reduce_by_key.cuh new file mode 100644 index 0000000000..6f4837b7f8 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/device/dispatch/dispatch_reduce_by_key.cuh @@ -0,0 +1,554 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceReduceByKey provides device-wide, parallel operations for reducing segments of values residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "dispatch_scan.cuh" +#include "../../agent/agent_reduce_by_key.cuh" +#include "../../thread/thread_operators.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../util_device.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +/** + * Multi-block reduce-by-key sweep kernel entry point + */ +template < + typename AgentReduceByKeyPolicyT, ///< Parameterized AgentReduceByKeyPolicyT tuning policy type + typename KeysInputIteratorT, ///< Random-access input iterator type for keys + typename UniqueOutputIteratorT, ///< Random-access output iterator type for keys + typename ValuesInputIteratorT, ///< Random-access input iterator type for values + typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values + typename NumRunsOutputIteratorT, ///< Output iterator type for recording number of segments encountered + typename ScanTileStateT, ///< Tile status interface type + typename EqualityOpT, ///< KeyT equality operator type + typename ReductionOpT, ///< ValueT reduction operator type + typename OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int(AgentReduceByKeyPolicyT::BLOCK_THREADS)) +__global__ void DeviceReduceByKeyKernel( + KeysInputIteratorT d_keys_in, ///< Pointer to the input sequence of keys + UniqueOutputIteratorT d_unique_out, ///< Pointer to the output sequence of unique keys (one key per run) + ValuesInputIteratorT d_values_in, ///< Pointer to the input sequence of corresponding values + AggregatesOutputIteratorT d_aggregates_out, ///< Pointer to the output sequence of value aggregates (one aggregate per run) + NumRunsOutputIteratorT d_num_runs_out, ///< Pointer to total number of runs encountered (i.e., the length of d_unique_out) + ScanTileStateT tile_state, ///< Tile status interface + int start_tile, ///< The starting tile for the current grid + EqualityOpT equality_op, ///< KeyT equality operator + ReductionOpT reduction_op, ///< ValueT reduction operator + OffsetT num_items) ///< Total number of items to select from +{ + // Thread block type for reducing tiles of value segments + typedef AgentReduceByKey< + AgentReduceByKeyPolicyT, + KeysInputIteratorT, + UniqueOutputIteratorT, + 
ValuesInputIteratorT, + AggregatesOutputIteratorT, + NumRunsOutputIteratorT, + EqualityOpT, + ReductionOpT, + OffsetT> + AgentReduceByKeyT; + + // Shared memory for AgentReduceByKey + __shared__ typename AgentReduceByKeyT::TempStorage temp_storage; + + // Process tiles + AgentReduceByKeyT(temp_storage, d_keys_in, d_unique_out, d_values_in, d_aggregates_out, d_num_runs_out, equality_op, reduction_op).ConsumeRange( + num_items, + tile_state, + start_tile); +} + + + + +/****************************************************************************** + * Dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for DeviceReduceByKey + */ +template < + typename KeysInputIteratorT, ///< Random-access input iterator type for keys + typename UniqueOutputIteratorT, ///< Random-access output iterator type for keys + typename ValuesInputIteratorT, ///< Random-access input iterator type for values + typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values + typename NumRunsOutputIteratorT, ///< Output iterator type for recording number of segments encountered + typename EqualityOpT, ///< KeyT equality operator type + typename ReductionOpT, ///< ValueT reduction operator type + typename OffsetT> ///< Signed integer type for global offsets +struct DispatchReduceByKey +{ + //------------------------------------------------------------------------- + // Types and constants + //------------------------------------------------------------------------- + + // The input keys type + typedef typename std::iterator_traits::value_type KeyInputT; + + // The output keys type + typedef typename If<(Equals::value_type, void>::VALUE), // KeyOutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type KeyOutputT; // ... else the output iterator's value type + + // The input values type + typedef typename std::iterator_traits::value_type ValueInputT; + + // The output values type + typedef typename If<(Equals::value_type, void>::VALUE), // ValueOutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type ValueOutputT; // ... else the output iterator's value type + + enum + { + INIT_KERNEL_THREADS = 128, + MAX_INPUT_BYTES = CUB_MAX(sizeof(KeyOutputT), sizeof(ValueOutputT)), + COMBINED_INPUT_BYTES = sizeof(KeyOutputT) + sizeof(ValueOutputT), + }; + + // Tile status descriptor interface type + typedef ReduceByKeyScanTileState ScanTileStateT; + + + //------------------------------------------------------------------------- + // Tuning policies + //------------------------------------------------------------------------- + + /// SM35 + struct Policy350 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 6, + ITEMS_PER_THREAD = (MAX_INPUT_BYTES <= 8) ? 
6 : CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)), + }; + + typedef AgentReduceByKeyPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_DIRECT, + LOAD_LDG, + BLOCK_SCAN_WARP_SCANS> + ReduceByKeyPolicyT; + }; + + /// SM30 + struct Policy300 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 6, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)), + }; + + typedef AgentReduceByKeyPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS> + ReduceByKeyPolicyT; + }; + + /// SM20 + struct Policy200 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 11, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)), + }; + + typedef AgentReduceByKeyPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS> + ReduceByKeyPolicyT; + }; + + /// SM13 + struct Policy130 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 7, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, ((NOMINAL_4B_ITEMS_PER_THREAD * 8) + COMBINED_INPUT_BYTES - 1) / COMBINED_INPUT_BYTES)), + }; + + typedef AgentReduceByKeyPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS> + ReduceByKeyPolicyT; + }; + + /// SM11 + struct Policy110 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 5, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 8) / COMBINED_INPUT_BYTES)), + }; + + typedef AgentReduceByKeyPolicy< + 64, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_RAKING> + ReduceByKeyPolicyT; + }; + + + /****************************************************************************** + * Tuning policies of current PTX compiler pass + ******************************************************************************/ + +#if (CUB_PTX_ARCH >= 350) + typedef Policy350 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 300) + typedef Policy300 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 200) + typedef Policy200 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 130) + typedef Policy130 PtxPolicy; + +#else + typedef Policy110 PtxPolicy; + +#endif + + // "Opaque" policies (whose parameterizations aren't reflected in the type signature) + struct PtxReduceByKeyPolicy : PtxPolicy::ReduceByKeyPolicyT {}; + + + /****************************************************************************** + * Utilities + ******************************************************************************/ + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ + static void InitConfigs( + int ptx_version, + KernelConfig &reduce_by_key_config) + { + #if (CUB_PTX_ARCH > 0) + (void)ptx_version; + + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + reduce_by_key_config.template Init(); + + #else + + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 350) + { + reduce_by_key_config.template Init(); + } + else if (ptx_version >= 300) + { + reduce_by_key_config.template Init(); + } + else if (ptx_version >= 200) + { + reduce_by_key_config.template Init(); 
+ } + else if (ptx_version >= 130) + { + reduce_by_key_config.template Init(); + } + else + { + reduce_by_key_config.template Init(); + } + + #endif + } + + + /** + * Kernel kernel dispatch configuration. + */ + struct KernelConfig + { + int block_threads; + int items_per_thread; + int tile_items; + + template + CUB_RUNTIME_FUNCTION __forceinline__ + void Init() + { + block_threads = PolicyT::BLOCK_THREADS; + items_per_thread = PolicyT::ITEMS_PER_THREAD; + tile_items = block_threads * items_per_thread; + } + }; + + + //--------------------------------------------------------------------- + // Dispatch entrypoints + //--------------------------------------------------------------------- + + /** + * Internal dispatch routine for computing a device-wide reduce-by-key using the + * specified kernel functions. + */ + template < + typename ScanInitKernelT, ///< Function type of cub::DeviceScanInitKernel + typename ReduceByKeyKernelT> ///< Function type of cub::DeviceReduceByKeyKernelT + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + KeysInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys + UniqueOutputIteratorT d_unique_out, ///< [out] Pointer to the output sequence of unique keys (one key per run) + ValuesInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of corresponding values + AggregatesOutputIteratorT d_aggregates_out, ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run) + NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out) + EqualityOpT equality_op, ///< [in] KeyT equality operator + ReductionOpT reduction_op, ///< [in] ValueT reduction operator + OffsetT num_items, ///< [in] Total number of items to select from + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ int /*ptx_version*/, ///< [in] PTX version of dispatch kernels + ScanInitKernelT init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel + ReduceByKeyKernelT reduce_by_key_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceReduceByKeyKernel + KernelConfig reduce_by_key_config) ///< [in] Dispatch parameters that match the policy that \p reduce_by_key_kernel was compiled for + { + +#ifndef CUB_RUNTIME_ENABLED + (void)d_temp_storage; + (void)temp_storage_bytes; + (void)d_keys_in; + (void)d_unique_out; + (void)d_values_in; + (void)d_aggregates_out; + (void)d_num_runs_out; + (void)equality_op; + (void)reduction_op; + (void)num_items; + (void)stream; + (void)debug_synchronous; + (void)init_kernel; + (void)reduce_by_key_kernel; + (void)reduce_by_key_config; + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported); + +#else + + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Number of input tiles + int tile_size = reduce_by_key_config.block_threads * reduce_by_key_config.items_per_thread; + int num_tiles = (num_items + tile_size - 1) / tile_size; + + // Specify temporary storage allocation requirements + size_t allocation_sizes[1]; + if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors + + // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob) + void* allocations[1]; + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + break; + } + + // Construct the tile status interface + ScanTileStateT tile_state; + if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; + + // Log init_kernel configuration + int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS); + if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); + + // Invoke init_kernel to initialize tile descriptors + init_kernel<<>>( + tile_state, + num_tiles, + d_num_runs_out); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Return if empty problem + if (num_items == 0) + break; + + // Get SM occupancy for reduce_by_key_kernel + int reduce_by_key_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy( + reduce_by_key_sm_occupancy, // out + reduce_by_key_kernel, + reduce_by_key_config.block_threads))) break; + + // Get max x-dimension of grid + int max_dim_x; + if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;; + + // Run grids in epochs (in case number of tiles exceeds max x-dimension + int scan_grid_size = CUB_MIN(num_tiles, max_dim_x); + for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size) + { + // Log reduce_by_key_kernel configuration + if (debug_synchronous) 
_CubLog("Invoking %d reduce_by_key_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + start_tile, scan_grid_size, reduce_by_key_config.block_threads, (long long) stream, reduce_by_key_config.items_per_thread, reduce_by_key_sm_occupancy); + + // Invoke reduce_by_key_kernel + reduce_by_key_kernel<<>>( + d_keys_in, + d_unique_out, + d_values_in, + d_aggregates_out, + d_num_runs_out, + tile_state, + start_tile, + equality_op, + reduction_op, + num_items); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + /** + * Internal dispatch routine + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + KeysInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys + UniqueOutputIteratorT d_unique_out, ///< [out] Pointer to the output sequence of unique keys (one key per run) + ValuesInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of corresponding values + AggregatesOutputIteratorT d_aggregates_out, ///< [out] Pointer to the output sequence of value aggregates (one aggregate per run) + NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs encountered (i.e., the length of d_unique_out) + EqualityOpT equality_op, ///< [in] KeyT equality operator + ReductionOpT reduction_op, ///< [in] ValueT reduction operator + OffsetT num_items, ///< [in] Total number of items to select from + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel kernel dispatch configurations + KernelConfig reduce_by_key_config; + InitConfigs(ptx_version, reduce_by_key_config); + + // Dispatch + if (CubDebug(error = Dispatch( + d_temp_storage, + temp_storage_bytes, + d_keys_in, + d_unique_out, + d_values_in, + d_aggregates_out, + d_num_runs_out, + equality_op, + reduction_op, + num_items, + stream, + debug_synchronous, + ptx_version, + DeviceCompactInitKernel, + DeviceReduceByKeyKernel, + reduce_by_key_config))) break; + } + while (0); + + return error; + } +}; + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/GraphBLAS/CUDA/local_cub/device/dispatch/dispatch_rle.cuh b/GraphBLAS/CUDA/local_cub/device/dispatch/dispatch_rle.cuh new file mode 100644 index 0000000000..98c3681f0a --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/device/dispatch/dispatch_rle.cuh @@ -0,0 +1,538 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceRle provides device-wide, parallel operations for run-length-encoding sequences of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "dispatch_scan.cuh" +#include "../../agent/agent_rle.cuh" +#include "../../thread/thread_operators.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../util_device.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +/** + * Select kernel entry point (multi-block) + * + * Performs functor-based selection if SelectOp functor type != NullType + * Otherwise performs flag-based selection if FlagIterator's value type != NullType + * Otherwise performs discontinuity selection (keep unique) + */ +template < + typename AgentRlePolicyT, ///< Parameterized AgentRlePolicyT tuning policy type + typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator + typename OffsetsOutputIteratorT, ///< Random-access output iterator type for writing run-offset values \iterator + typename LengthsOutputIteratorT, ///< Random-access output iterator type for writing run-length values \iterator + typename NumRunsOutputIteratorT, ///< Output iterator type for recording the number of runs encountered \iterator + typename ScanTileStateT, ///< Tile status interface type + typename EqualityOpT, ///< T equality operator type + typename OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int(AgentRlePolicyT::BLOCK_THREADS)) +__global__ void DeviceRleSweepKernel( + InputIteratorT d_in, ///< [in] Pointer to input sequence of data items + OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to output sequence of run-offsets + 
LengthsOutputIteratorT d_lengths_out, ///< [out] Pointer to output sequence of run-lengths + NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs (i.e., length of \p d_offsets_out) + ScanTileStateT tile_status, ///< [in] Tile status interface + EqualityOpT equality_op, ///< [in] Equality operator for input items + OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + int num_tiles) ///< [in] Total number of tiles for the entire problem +{ + // Thread block type for selecting data from input tiles + typedef AgentRle< + AgentRlePolicyT, + InputIteratorT, + OffsetsOutputIteratorT, + LengthsOutputIteratorT, + EqualityOpT, + OffsetT> AgentRleT; + + // Shared memory for AgentRle + __shared__ typename AgentRleT::TempStorage temp_storage; + + // Process tiles + AgentRleT(temp_storage, d_in, d_offsets_out, d_lengths_out, equality_op, num_items).ConsumeRange( + num_tiles, + tile_status, + d_num_runs_out); +} + + + + +/****************************************************************************** + * Dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for DeviceRle + */ +template < + typename InputIteratorT, ///< Random-access input iterator type for reading input items \iterator + typename OffsetsOutputIteratorT, ///< Random-access output iterator type for writing run-offset values \iterator + typename LengthsOutputIteratorT, ///< Random-access output iterator type for writing run-length values \iterator + typename NumRunsOutputIteratorT, ///< Output iterator type for recording the number of runs encountered \iterator + typename EqualityOpT, ///< T equality operator type + typename OffsetT> ///< Signed integer type for global offsets +struct DeviceRleDispatch +{ + /****************************************************************************** + * Types and constants + ******************************************************************************/ + + // The input value type + typedef typename std::iterator_traits::value_type T; + + // The lengths output value type + typedef typename If<(Equals::value_type, void>::VALUE), // LengthT = (if output iterator's value type is void) ? + OffsetT, // ... then the OffsetT type, + typename std::iterator_traits::value_type>::Type LengthT; // ... 
else the output iterator's value type + + enum + { + INIT_KERNEL_THREADS = 128, + }; + + // Tile status descriptor interface type + typedef ReduceByKeyScanTileState ScanTileStateT; + + + /****************************************************************************** + * Tuning policies + ******************************************************************************/ + + /// SM35 + struct Policy350 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 15, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef AgentRlePolicy< + 96, + ITEMS_PER_THREAD, + BLOCK_LOAD_DIRECT, + LOAD_LDG, + true, + BLOCK_SCAN_WARP_SCANS> + RleSweepPolicy; + }; + + /// SM30 + struct Policy300 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 5, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef AgentRlePolicy< + 256, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + true, + BLOCK_SCAN_RAKING_MEMOIZE> + RleSweepPolicy; + }; + + /// SM20 + struct Policy200 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 15, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef AgentRlePolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS> + RleSweepPolicy; + }; + + /// SM13 + struct Policy130 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 9, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef AgentRlePolicy< + 64, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + true, + BLOCK_SCAN_RAKING_MEMOIZE> + RleSweepPolicy; + }; + + /// SM10 + struct Policy100 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 9, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T)))), + }; + + typedef AgentRlePolicy< + 256, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + true, + BLOCK_SCAN_RAKING_MEMOIZE> + RleSweepPolicy; + }; + + + /****************************************************************************** + * Tuning policies of current PTX compiler pass + ******************************************************************************/ + +#if (CUB_PTX_ARCH >= 350) + typedef Policy350 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 300) + typedef Policy300 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 200) + typedef Policy200 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 130) + typedef Policy130 PtxPolicy; + +#else + typedef Policy100 PtxPolicy; + +#endif + + // "Opaque" policies (whose parameterizations aren't reflected in the type signature) + struct PtxRleSweepPolicy : PtxPolicy::RleSweepPolicy {}; + + + /****************************************************************************** + * Utilities + ******************************************************************************/ + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ + static void InitConfigs( + int ptx_version, + KernelConfig& device_rle_config) + { + #if (CUB_PTX_ARCH > 0) + + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + device_rle_config.template Init(); + + #else + + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that 
match the device's PTX version + if (ptx_version >= 350) + { + device_rle_config.template Init(); + } + else if (ptx_version >= 300) + { + device_rle_config.template Init(); + } + else if (ptx_version >= 200) + { + device_rle_config.template Init(); + } + else if (ptx_version >= 130) + { + device_rle_config.template Init(); + } + else + { + device_rle_config.template Init(); + } + + #endif + } + + + /** + * Kernel kernel dispatch configuration. Mirrors the constants within AgentRlePolicyT. + */ + struct KernelConfig + { + int block_threads; + int items_per_thread; + BlockLoadAlgorithm load_policy; + bool store_warp_time_slicing; + BlockScanAlgorithm scan_algorithm; + + template + CUB_RUNTIME_FUNCTION __forceinline__ + void Init() + { + block_threads = AgentRlePolicyT::BLOCK_THREADS; + items_per_thread = AgentRlePolicyT::ITEMS_PER_THREAD; + load_policy = AgentRlePolicyT::LOAD_ALGORITHM; + store_warp_time_slicing = AgentRlePolicyT::STORE_WARP_TIME_SLICING; + scan_algorithm = AgentRlePolicyT::SCAN_ALGORITHM; + } + + CUB_RUNTIME_FUNCTION __forceinline__ + void Print() + { + printf("%d, %d, %d, %d, %d", + block_threads, + items_per_thread, + load_policy, + store_warp_time_slicing, + scan_algorithm); + } + }; + + + /****************************************************************************** + * Dispatch entrypoints + ******************************************************************************/ + + /** + * Internal dispatch routine for computing a device-wide run-length-encode using the + * specified kernel functions. + */ + template < + typename DeviceScanInitKernelPtr, ///< Function type of cub::DeviceScanInitKernel + typename DeviceRleSweepKernelPtr> ///< Function type of cub::DeviceRleSweepKernelPtr + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to the output sequence of run-offsets + LengthsOutputIteratorT d_lengths_out, ///< [out] Pointer to the output sequence of run-lengths + NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to the total number of runs encountered (i.e., length of \p d_offsets_out) + EqualityOpT equality_op, ///< [in] Equality operator for input items + OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ int ptx_version, ///< [in] PTX version of dispatch kernels + DeviceScanInitKernelPtr device_scan_init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel + DeviceRleSweepKernelPtr device_rle_sweep_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceRleSweepKernel + KernelConfig device_rle_config) ///< [in] Dispatch parameters that match the policy that \p device_rle_sweep_kernel was compiled for + { + +#ifndef CUB_RUNTIME_ENABLED + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported); + +#else + + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Number of input tiles + int tile_size = device_rle_config.block_threads * device_rle_config.items_per_thread; + int num_tiles = (num_items + tile_size - 1) / tile_size; + + // Specify temporary storage allocation requirements + size_t allocation_sizes[1]; + if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors + + // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob) + void* allocations[1]; + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + break; + } + + // Construct the tile status interface + ScanTileStateT tile_status; + if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; + + // Log device_scan_init_kernel configuration + int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS); + if (debug_synchronous) _CubLog("Invoking device_scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); + + // Invoke device_scan_init_kernel to initialize tile descriptors and queue descriptors + device_scan_init_kernel<<>>( + tile_status, + num_tiles, + d_num_runs_out); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Return if empty problem + if (num_items == 0) + break; + + // Get SM occupancy for device_rle_sweep_kernel + int device_rle_kernel_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy( + device_rle_kernel_sm_occupancy, // out + device_rle_sweep_kernel, + device_rle_config.block_threads))) break; + + // Get max x-dimension of grid + int max_dim_x; + if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;; + + // Get grid size for scanning tiles + dim3 scan_grid_size; + scan_grid_size.z = 1; + scan_grid_size.y = ((unsigned int) num_tiles + max_dim_x - 1) / max_dim_x; + scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x); + + // Log device_rle_sweep_kernel configuration + if (debug_synchronous) _CubLog("Invoking device_rle_sweep_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, device_rle_config.block_threads, (long long) stream, 
device_rle_config.items_per_thread, device_rle_kernel_sm_occupancy); + + // Invoke device_rle_sweep_kernel + device_rle_sweep_kernel<<>>( + d_in, + d_offsets_out, + d_lengths_out, + d_num_runs_out, + tile_status, + equality_op, + num_items, + num_tiles); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + /** + * Internal dispatch routine + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to input sequence of data items + OffsetsOutputIteratorT d_offsets_out, ///< [out] Pointer to output sequence of run-offsets + LengthsOutputIteratorT d_lengths_out, ///< [out] Pointer to output sequence of run-lengths + NumRunsOutputIteratorT d_num_runs_out, ///< [out] Pointer to total number of runs (i.e., length of \p d_offsets_out) + EqualityOpT equality_op, ///< [in] Equality operator for input items + OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel kernel dispatch configurations + KernelConfig device_rle_config; + InitConfigs(ptx_version, device_rle_config); + + // Dispatch + if (CubDebug(error = Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_offsets_out, + d_lengths_out, + d_num_runs_out, + equality_op, + num_items, + stream, + debug_synchronous, + ptx_version, + DeviceCompactInitKernel, + DeviceRleSweepKernel, + device_rle_config))) break; + } + while (0); + + return error; + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/GraphBLAS/CUDA/local_cub/device/dispatch/dispatch_scan.cuh b/GraphBLAS/CUDA/local_cub/device/dispatch/dispatch_scan.cuh new file mode 100644 index 0000000000..3ef720a446 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/device/dispatch/dispatch_scan.cuh @@ -0,0 +1,563 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
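
DeviceRleDispatch follows the same NULL-query-then-allocate convention, producing the offset and length of every run of equal items. In upstream CUB 1.8 the public wrapper that drives it is cub::DeviceRunLengthEncode::NonTrivialRuns; the wrapper name and include path below are assumptions, not part of this diff:

    // Sketch only, assuming the vendored headers mirror the upstream CUB 1.8 API.
    #include <cub/cub.cuh>
    #include <cuda_runtime.h>

    // Report offset/length of every run longer than one item in d_in[0..num_items).
    // All pointers are device pointers owned by the caller.
    cudaError_t nontrivial_runs(const int *d_in, int *d_offsets_out, int *d_lengths_out,
                                int *d_num_runs_out, int num_items, cudaStream_t stream)
    {
        void  *d_temp_storage     = NULL;
        size_t temp_storage_bytes = 0;

        // Pass 1: NULL temp storage -> only temp_storage_bytes is written.
        cudaError_t err = cub::DeviceRunLengthEncode::NonTrivialRuns(
            d_temp_storage, temp_storage_bytes,
            d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items, stream);
        if (err != cudaSuccess) return err;

        // Pass 2: allocate, then the dispatcher launches DeviceCompactInitKernel
        // followed by DeviceRleSweepKernel on the given stream.
        err = cudaMalloc(&d_temp_storage, temp_storage_bytes);
        if (err != cudaSuccess) return err;
        err = cub::DeviceRunLengthEncode::NonTrivialRuns(
            d_temp_storage, temp_storage_bytes,
            d_in, d_offsets_out, d_lengths_out, d_num_runs_out, num_items, stream);
        cudaFree(d_temp_storage);
        return err;
    }
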
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceScan provides device-wide, parallel operations for computing a prefix scan across a sequence of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "../../agent/agent_scan.cuh" +#include "../../thread/thread_operators.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../util_arch.cuh" +#include "../../util_debug.cuh" +#include "../../util_device.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +/** + * Initialization kernel for tile status initialization (multi-block) + */ +template < + typename ScanTileStateT> ///< Tile status interface type +__global__ void DeviceScanInitKernel( + ScanTileStateT tile_state, ///< [in] Tile status interface + int num_tiles) ///< [in] Number of tiles +{ + // Initialize tile status + tile_state.InitializeStatus(num_tiles); +} + +/** + * Initialization kernel for tile status initialization (multi-block) + */ +template < + typename ScanTileStateT, ///< Tile status interface type + typename NumSelectedIteratorT> ///< Output iterator type for recording the number of items selected +__global__ void DeviceCompactInitKernel( + ScanTileStateT tile_state, ///< [in] Tile status interface + int num_tiles, ///< [in] Number of tiles + NumSelectedIteratorT d_num_selected_out) ///< [out] Pointer to the total number of items selected (i.e., length of \p d_selected_out) +{ + // Initialize tile status + tile_state.InitializeStatus(num_tiles); + + // Initialize d_num_selected_out + if ((blockIdx.x == 0) && (threadIdx.x == 0)) + *d_num_selected_out = 0; +} + + +/** + * Scan kernel entry point (multi-block) + */ +template < + typename ScanPolicyT, ///< Parameterized ScanPolicyT tuning policy type + typename InputIteratorT, ///< Random-access input iterator type for reading scan inputs \iterator + typename OutputIteratorT, ///< Random-access output iterator type for writing scan outputs \iterator + typename ScanTileStateT, ///< Tile status interface type 
+ typename ScanOpT, ///< Binary scan functor type having member T operator()(const T &a, const T &b) + typename InitValueT, ///< Initial value to seed the exclusive scan (cub::NullType for inclusive scans) + typename OffsetT> ///< Signed integer type for global offsets +__launch_bounds__ (int(ScanPolicyT::BLOCK_THREADS)) +__global__ void DeviceScanKernel( + InputIteratorT d_in, ///< Input data + OutputIteratorT d_out, ///< Output data + ScanTileStateT tile_state, ///< Tile status interface + int start_tile, ///< The starting tile for the current grid + ScanOpT scan_op, ///< Binary scan functor + InitValueT init_value, ///< Initial value to seed the exclusive scan + OffsetT num_items) ///< Total number of scan items for the entire problem +{ + // Thread block type for scanning input tiles + typedef AgentScan< + ScanPolicyT, + InputIteratorT, + OutputIteratorT, + ScanOpT, + InitValueT, + OffsetT> AgentScanT; + + // Shared memory for AgentScan + __shared__ typename AgentScanT::TempStorage temp_storage; + + // Process tiles + AgentScanT(temp_storage, d_in, d_out, scan_op, init_value).ConsumeRange( + num_items, + tile_state, + start_tile); +} + + + + +/****************************************************************************** + * Dispatch + ******************************************************************************/ + + +/** + * Utility class for dispatching the appropriately-tuned kernels for DeviceScan + */ +template < + typename InputIteratorT, ///< Random-access input iterator type for reading scan inputs \iterator + typename OutputIteratorT, ///< Random-access output iterator type for writing scan outputs \iterator + typename ScanOpT, ///< Binary scan functor type having member T operator()(const T &a, const T &b) + typename InitValueT, ///< The init_value element type for ScanOpT (cub::NullType for inclusive scans) + typename OffsetT> ///< Signed integer type for global offsets +struct DispatchScan +{ + //--------------------------------------------------------------------- + // Constants and Types + //--------------------------------------------------------------------- + + enum + { + INIT_KERNEL_THREADS = 128 + }; + + // The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... 
else the output iterator's value type + + // Tile status descriptor interface type + typedef ScanTileState ScanTileStateT; + + + //--------------------------------------------------------------------- + // Tuning policies + //--------------------------------------------------------------------- + + /// SM600 + struct Policy600 + { + typedef AgentScanPolicy< + CUB_SCALED_GRANULARITIES(128, 15, OutputT), ///< Threads per block, items per thread + BLOCK_LOAD_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS> + ScanPolicyT; + }; + + + /// SM520 + struct Policy520 + { + // Titan X: 32.47B items/s @ 48M 32-bit T + typedef AgentScanPolicy< + CUB_SCALED_GRANULARITIES(128, 12, OutputT), ///< Threads per block, items per thread + BLOCK_LOAD_DIRECT, + LOAD_LDG, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS> + ScanPolicyT; + }; + + + /// SM35 + struct Policy350 + { + // GTX Titan: 29.5B items/s (232.4 GB/s) @ 48M 32-bit T + typedef AgentScanPolicy< + CUB_SCALED_GRANULARITIES(128, 12, OutputT), ///< Threads per block, items per thread + BLOCK_LOAD_DIRECT, + LOAD_LDG, + BLOCK_STORE_WARP_TRANSPOSE_TIMESLICED, + BLOCK_SCAN_RAKING> + ScanPolicyT; + }; + + /// SM30 + struct Policy300 + { + typedef AgentScanPolicy< + CUB_SCALED_GRANULARITIES(256, 9, OutputT), ///< Threads per block, items per thread + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS> + ScanPolicyT; + }; + + /// SM20 + struct Policy200 + { + // GTX 580: 20.3B items/s (162.3 GB/s) @ 48M 32-bit T + typedef AgentScanPolicy< + CUB_SCALED_GRANULARITIES(128, 12, OutputT), ///< Threads per block, items per thread + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS> + ScanPolicyT; + }; + + /// SM13 + struct Policy130 + { + typedef AgentScanPolicy< + CUB_SCALED_GRANULARITIES(96, 21, OutputT), ///< Threads per block, items per thread + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_RAKING_MEMOIZE> + ScanPolicyT; + }; + + /// SM10 + struct Policy100 + { + typedef AgentScanPolicy< + CUB_SCALED_GRANULARITIES(64, 9, OutputT), ///< Threads per block, items per thread + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_STORE_WARP_TRANSPOSE, + BLOCK_SCAN_WARP_SCANS> + ScanPolicyT; + }; + + + //--------------------------------------------------------------------- + // Tuning policies of current PTX compiler pass + //--------------------------------------------------------------------- + +#if (CUB_PTX_ARCH >= 600) + typedef Policy600 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 520) + typedef Policy520 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 350) + typedef Policy350 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 300) + typedef Policy300 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 200) + typedef Policy200 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 130) + typedef Policy130 PtxPolicy; + +#else + typedef Policy100 PtxPolicy; + +#endif + + // "Opaque" policies (whose parameterizations aren't reflected in the type signature) + struct PtxAgentScanPolicy : PtxPolicy::ScanPolicyT {}; + + + //--------------------------------------------------------------------- + // Utilities + //--------------------------------------------------------------------- + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ + static void InitConfigs( + int ptx_version, + KernelConfig &scan_kernel_config) + { + #if (CUB_PTX_ARCH > 0) + 
(void)ptx_version; + + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + scan_kernel_config.template Init(); + + #else + + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 600) + { + scan_kernel_config.template Init(); + } + else if (ptx_version >= 520) + { + scan_kernel_config.template Init(); + } + else if (ptx_version >= 350) + { + scan_kernel_config.template Init(); + } + else if (ptx_version >= 300) + { + scan_kernel_config.template Init(); + } + else if (ptx_version >= 200) + { + scan_kernel_config.template Init(); + } + else if (ptx_version >= 130) + { + scan_kernel_config.template Init(); + } + else + { + scan_kernel_config.template Init(); + } + + #endif + } + + + /** + * Kernel kernel dispatch configuration. + */ + struct KernelConfig + { + int block_threads; + int items_per_thread; + int tile_items; + + template + CUB_RUNTIME_FUNCTION __forceinline__ + void Init() + { + block_threads = PolicyT::BLOCK_THREADS; + items_per_thread = PolicyT::ITEMS_PER_THREAD; + tile_items = block_threads * items_per_thread; + } + }; + + + //--------------------------------------------------------------------- + // Dispatch entrypoints + //--------------------------------------------------------------------- + + /** + * Internal dispatch routine for computing a device-wide prefix scan using the + * specified kernel functions. + */ + template < + typename ScanInitKernelPtrT, ///< Function type of cub::DeviceScanInitKernel + typename ScanSweepKernelPtrT> ///< Function type of cub::DeviceScanKernelPtrT + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items + ScanOpT scan_op, ///< [in] Binary scan functor + InitValueT init_value, ///< [in] Initial value to seed the exclusive scan + OffsetT num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ int /*ptx_version*/, ///< [in] PTX version of dispatch kernels + ScanInitKernelPtrT init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel + ScanSweepKernelPtrT scan_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanKernel + KernelConfig scan_kernel_config) ///< [in] Dispatch parameters that match the policy that \p scan_kernel was compiled for + { + +#ifndef CUB_RUNTIME_ENABLED + (void)d_temp_storage; + (void)temp_storage_bytes; + (void)d_in; + (void)d_out; + (void)scan_op; + (void)init_value; + (void)num_items; + (void)stream; + (void)debug_synchronous; + (void)init_kernel; + (void)scan_kernel; + (void)scan_kernel_config; + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported); + +#else + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Number of input tiles + int tile_size = scan_kernel_config.block_threads * scan_kernel_config.items_per_thread; + int num_tiles = (num_items + tile_size - 1) / tile_size; + + // Specify temporary storage allocation requirements + size_t allocation_sizes[1]; + if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors + + // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob) + void* allocations[1]; + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + break; + } + + // Return if empty problem + if (num_items == 0) + break; + + // Construct the tile status interface + ScanTileStateT tile_state; + if (CubDebug(error = tile_state.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; + + // Log init_kernel configuration + int init_grid_size = (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS; + if (debug_synchronous) _CubLog("Invoking init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); + + // Invoke init_kernel to initialize tile descriptors + init_kernel<<>>( + tile_state, + num_tiles); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Get SM occupancy for scan_kernel + int scan_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy( + scan_sm_occupancy, // out + scan_kernel, + scan_kernel_config.block_threads))) break; + + // Get max x-dimension of grid + int max_dim_x; + if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;; + + // Run grids in epochs (in case number of tiles exceeds max x-dimension + int scan_grid_size = CUB_MIN(num_tiles, max_dim_x); + for (int start_tile = 0; start_tile < num_tiles; start_tile += scan_grid_size) + { + // Log scan_kernel configuration + if (debug_synchronous) _CubLog("Invoking %d scan_kernel<<<%d, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + start_tile, scan_grid_size, scan_kernel_config.block_threads, (long long) stream, scan_kernel_config.items_per_thread, 
scan_sm_occupancy); + + // Invoke scan_kernel + scan_kernel<<>>( + d_in, + d_out, + tile_state, + start_tile, + scan_op, + init_value, + num_items); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + /** + * Internal dispatch routine + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + OutputIteratorT d_out, ///< [out] Pointer to the output sequence of data items + ScanOpT scan_op, ///< [in] Binary scan functor + InitValueT init_value, ///< [in] Initial value to seed the exclusive scan + OffsetT num_items, ///< [in] Total number of input items (i.e., the length of \p d_in) + cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + if (CubDebug(error = PtxVersion(ptx_version))) break; + + // Get kernel kernel dispatch configurations + KernelConfig scan_kernel_config; + InitConfigs(ptx_version, scan_kernel_config); + + // Dispatch + if (CubDebug(error = Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + scan_op, + init_value, + num_items, + stream, + debug_synchronous, + ptx_version, + DeviceScanInitKernel, + DeviceScanKernel, + scan_kernel_config))) break; + } + while (0); + + return error; + } +}; + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/GraphBLAS/CUDA/local_cub/device/dispatch/dispatch_select_if.cuh b/GraphBLAS/CUDA/local_cub/device/dispatch/dispatch_select_if.cuh new file mode 100644 index 0000000000..60b331338d --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/device/dispatch/dispatch_select_if.cuh @@ -0,0 +1,542 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
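
DispatchScan serves both flavors of scan: the init_value parameter seen in DeviceScanKernel carries a real seed for exclusive scans and cub::NullType for inclusive ones. The sketch below drives it through what upstream CUB 1.8 exposes as cub::DeviceScan::ExclusiveSum (an assumed wrapper, not shown in this diff), with debug_synchronous set to true so the _CubLog launch reports above are printed and each launch is synchronized:

    // Sketch only: cub::DeviceScan::ExclusiveSum and the include path are
    // assumptions based on the upstream CUB 1.8 API.
    #include <cub/cub.cuh>
    #include <cuda_runtime.h>

    // Exclusive prefix sum of d_in into d_out (device pointers, num_items each).
    cudaError_t exclusive_sum(const float *d_in, float *d_out, int num_items)
    {
        void  *d_temp_storage     = NULL;
        size_t temp_storage_bytes = 0;

        // Pass 1: size query only (d_temp_storage == NULL, no work is done).
        cub::DeviceScan::ExclusiveSum(d_temp_storage, temp_storage_bytes,
                                      d_in, d_out, num_items);

        // Pass 2: run the scan; debug_synchronous = true makes the dispatcher
        // print the init_kernel / scan_kernel launch configurations and sync
        // the stream after every launch to surface runtime errors.
        cudaMalloc(&d_temp_storage, temp_storage_bytes);
        cudaError_t err = cub::DeviceScan::ExclusiveSum(
            d_temp_storage, temp_storage_bytes,
            d_in, d_out, num_items, (cudaStream_t) 0, true);
        cudaFree(d_temp_storage);
        return err;
    }
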
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceSelect provides device-wide, parallel operations for selecting items from sequences of data items residing within device-accessible memory. + */ + +#pragma once + +#include +#include + +#include "dispatch_scan.cuh" +#include "../../agent/agent_select_if.cuh" +#include "../../thread/thread_operators.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../util_device.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/****************************************************************************** + * Kernel entry points + *****************************************************************************/ + +/** + * Select kernel entry point (multi-block) + * + * Performs functor-based selection if SelectOpT functor type != NullType + * Otherwise performs flag-based selection if FlagsInputIterator's value type != NullType + * Otherwise performs discontinuity selection (keep unique) + */ +template < + typename AgentSelectIfPolicyT, ///< Parameterized AgentSelectIfPolicyT tuning policy type + typename InputIteratorT, ///< Random-access input iterator type for reading input items + typename FlagsInputIteratorT, ///< Random-access input iterator type for reading selection flags (NullType* if a selection functor or discontinuity flagging is to be used for selection) + typename SelectedOutputIteratorT, ///< Random-access output iterator type for writing selected items + typename NumSelectedIteratorT, ///< Output iterator type for recording the number of items selected + typename ScanTileStateT, ///< Tile status interface type + typename SelectOpT, ///< Selection operator type (NullType if selection flags or discontinuity flagging is to be used for selection) + typename EqualityOpT, ///< Equality operator type (NullType if selection functor or selection flags is to be used for selection) + typename OffsetT, ///< Signed integer type for global offsets + bool KEEP_REJECTS> ///< Whether or not we push rejected items to the back of the output +__launch_bounds__ (int(AgentSelectIfPolicyT::BLOCK_THREADS)) +__global__ void DeviceSelectSweepKernel( + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + FlagsInputIteratorT d_flags, ///< [in] Pointer to the input sequence of selection flags (if applicable) + SelectedOutputIteratorT d_selected_out, ///< [out] Pointer to the output sequence of selected data items + NumSelectedIteratorT d_num_selected_out, ///< [out] Pointer to the total number of items selected (i.e., length of \p d_selected_out) + ScanTileStateT tile_status, ///< [in] Tile status interface + SelectOpT 
select_op, ///< [in] Selection operator + EqualityOpT equality_op, ///< [in] Equality operator + OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + int num_tiles) ///< [in] Total number of tiles for the entire problem +{ + // Thread block type for selecting data from input tiles + typedef AgentSelectIf< + AgentSelectIfPolicyT, + InputIteratorT, + FlagsInputIteratorT, + SelectedOutputIteratorT, + SelectOpT, + EqualityOpT, + OffsetT, + KEEP_REJECTS> AgentSelectIfT; + + // Shared memory for AgentSelectIf + __shared__ typename AgentSelectIfT::TempStorage temp_storage; + + // Process tiles + AgentSelectIfT(temp_storage, d_in, d_flags, d_selected_out, select_op, equality_op, num_items).ConsumeRange( + num_tiles, + tile_status, + d_num_selected_out); +} + + + + +/****************************************************************************** + * Dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for DeviceSelect + */ +template < + typename InputIteratorT, ///< Random-access input iterator type for reading input items + typename FlagsInputIteratorT, ///< Random-access input iterator type for reading selection flags (NullType* if a selection functor or discontinuity flagging is to be used for selection) + typename SelectedOutputIteratorT, ///< Random-access output iterator type for writing selected items + typename NumSelectedIteratorT, ///< Output iterator type for recording the number of items selected + typename SelectOpT, ///< Selection operator type (NullType if selection flags or discontinuity flagging is to be used for selection) + typename EqualityOpT, ///< Equality operator type (NullType if selection functor or selection flags is to be used for selection) + typename OffsetT, ///< Signed integer type for global offsets + bool KEEP_REJECTS> ///< Whether or not we push rejected items to the back of the output +struct DispatchSelectIf +{ + /****************************************************************************** + * Types and constants + ******************************************************************************/ + + // The output value type + typedef typename If<(Equals::value_type, void>::VALUE), // OutputT = (if output iterator's value type is void) ? + typename std::iterator_traits::value_type, // ... then the input iterator's value type, + typename std::iterator_traits::value_type>::Type OutputT; // ... 
else the output iterator's value type + + // The flag value type + typedef typename std::iterator_traits::value_type FlagT; + + enum + { + INIT_KERNEL_THREADS = 128, + }; + + // Tile status descriptor interface type + typedef ScanTileState ScanTileStateT; + + + /****************************************************************************** + * Tuning policies + ******************************************************************************/ + + /// SM35 + struct Policy350 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 10, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))), + }; + + typedef AgentSelectIfPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_DIRECT, + LOAD_LDG, + BLOCK_SCAN_WARP_SCANS> + SelectIfPolicyT; + }; + + /// SM30 + struct Policy300 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 7, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(3, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))), + }; + + typedef AgentSelectIfPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS> + SelectIfPolicyT; + }; + + /// SM20 + struct Policy200 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = (KEEP_REJECTS) ? 7 : 15, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))), + }; + + typedef AgentSelectIfPolicy< + 128, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS> + SelectIfPolicyT; + }; + + /// SM13 + struct Policy130 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 9, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))), + }; + + typedef AgentSelectIfPolicy< + 64, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_RAKING_MEMOIZE> + SelectIfPolicyT; + }; + + /// SM10 + struct Policy100 + { + enum { + NOMINAL_4B_ITEMS_PER_THREAD = 9, + ITEMS_PER_THREAD = CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD, CUB_MAX(1, (NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(OutputT)))), + }; + + typedef AgentSelectIfPolicy< + 64, + ITEMS_PER_THREAD, + BLOCK_LOAD_WARP_TRANSPOSE, + LOAD_DEFAULT, + BLOCK_SCAN_RAKING> + SelectIfPolicyT; + }; + + + /****************************************************************************** + * Tuning policies of current PTX compiler pass + ******************************************************************************/ + +#if (CUB_PTX_ARCH >= 350) + typedef Policy350 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 300) + typedef Policy300 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 200) + typedef Policy200 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 130) + typedef Policy130 PtxPolicy; + +#else + typedef Policy100 PtxPolicy; + +#endif + + // "Opaque" policies (whose parameterizations aren't reflected in the type signature) + struct PtxSelectIfPolicyT : PtxPolicy::SelectIfPolicyT {}; + + + /****************************************************************************** + * Utilities + ******************************************************************************/ + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ + static void InitConfigs( + int ptx_version, + KernelConfig &select_if_config) + { + #if (CUB_PTX_ARCH > 0) + (void)ptx_version; + + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + select_if_config.template 
Init(); + + #else + + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 350) + { + select_if_config.template Init(); + } + else if (ptx_version >= 300) + { + select_if_config.template Init(); + } + else if (ptx_version >= 200) + { + select_if_config.template Init(); + } + else if (ptx_version >= 130) + { + select_if_config.template Init(); + } + else + { + select_if_config.template Init(); + } + + #endif + } + + + /** + * Kernel kernel dispatch configuration. + */ + struct KernelConfig + { + int block_threads; + int items_per_thread; + int tile_items; + + template + CUB_RUNTIME_FUNCTION __forceinline__ + void Init() + { + block_threads = PolicyT::BLOCK_THREADS; + items_per_thread = PolicyT::ITEMS_PER_THREAD; + tile_items = block_threads * items_per_thread; + } + }; + + + /****************************************************************************** + * Dispatch entrypoints + ******************************************************************************/ + + /** + * Internal dispatch routine for computing a device-wide selection using the + * specified kernel functions. + */ + template < + typename ScanInitKernelPtrT, ///< Function type of cub::DeviceScanInitKernel + typename SelectIfKernelPtrT> ///< Function type of cub::SelectIfKernelPtrT + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + FlagsInputIteratorT d_flags, ///< [in] Pointer to the input sequence of selection flags (if applicable) + SelectedOutputIteratorT d_selected_out, ///< [in] Pointer to the output sequence of selected data items + NumSelectedIteratorT d_num_selected_out, ///< [in] Pointer to the total number of items selected (i.e., length of \p d_selected_out) + SelectOpT select_op, ///< [in] Selection operator + EqualityOpT equality_op, ///< [in] Equality operator + OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
+ int /*ptx_version*/, ///< [in] PTX version of dispatch kernels + ScanInitKernelPtrT scan_init_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceScanInitKernel + SelectIfKernelPtrT select_if_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceSelectSweepKernel + KernelConfig select_if_config) ///< [in] Dispatch parameters that match the policy that \p select_if_kernel was compiled for + { + +#ifndef CUB_RUNTIME_ENABLED + (void)d_temp_storage; + (void)temp_storage_bytes; + (void)d_in; + (void)d_flags; + (void)d_selected_out; + (void)d_num_selected_out; + (void)select_op; + (void)equality_op; + (void)num_items; + (void)stream; + (void)debug_synchronous; + (void)scan_init_kernel; + (void)select_if_kernel; + (void)select_if_config; + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported); + +#else + + cudaError error = cudaSuccess; + do + { + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Number of input tiles + int tile_size = select_if_config.block_threads * select_if_config.items_per_thread; + int num_tiles = (num_items + tile_size - 1) / tile_size; + + // Specify temporary storage allocation requirements + size_t allocation_sizes[1]; + if (CubDebug(error = ScanTileStateT::AllocationSize(num_tiles, allocation_sizes[0]))) break; // bytes needed for tile status descriptors + + // Compute allocation pointers into the single storage blob (or compute the necessary size of the blob) + void* allocations[1]; + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + break; + } + + // Construct the tile status interface + ScanTileStateT tile_status; + if (CubDebug(error = tile_status.Init(num_tiles, allocations[0], allocation_sizes[0]))) break; + + // Log scan_init_kernel configuration + int init_grid_size = CUB_MAX(1, (num_tiles + INIT_KERNEL_THREADS - 1) / INIT_KERNEL_THREADS); + if (debug_synchronous) _CubLog("Invoking scan_init_kernel<<<%d, %d, 0, %lld>>>()\n", init_grid_size, INIT_KERNEL_THREADS, (long long) stream); + + // Invoke scan_init_kernel to initialize tile descriptors + scan_init_kernel<<>>( + tile_status, + num_tiles, + d_num_selected_out); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Return if empty problem + if (num_items == 0) + break; + + // Get SM occupancy for select_if_kernel + int range_select_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy( + range_select_sm_occupancy, // out + select_if_kernel, + select_if_config.block_threads))) break; + + // Get max x-dimension of grid + int max_dim_x; + if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;; + + // Get grid size for scanning tiles + dim3 scan_grid_size; + scan_grid_size.z = 1; + scan_grid_size.y = ((unsigned int) num_tiles + max_dim_x - 1) / max_dim_x; + scan_grid_size.x = CUB_MIN(num_tiles, max_dim_x); + + // Log select_if_kernel configuration + if (debug_synchronous) _CubLog("Invoking 
select_if_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + scan_grid_size.x, scan_grid_size.y, scan_grid_size.z, select_if_config.block_threads, (long long) stream, select_if_config.items_per_thread, range_select_sm_occupancy); + + // Invoke select_if_kernel + select_if_kernel<<>>( + d_in, + d_flags, + d_selected_out, + d_num_selected_out, + tile_status, + select_op, + equality_op, + num_items, + num_tiles); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + /** + * Internal dispatch routine + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + InputIteratorT d_in, ///< [in] Pointer to the input sequence of data items + FlagsInputIteratorT d_flags, ///< [in] Pointer to the input sequence of selection flags (if applicable) + SelectedOutputIteratorT d_selected_out, ///< [in] Pointer to the output sequence of selected data items + NumSelectedIteratorT d_num_selected_out, ///< [in] Pointer to the total number of items selected (i.e., length of \p d_selected_out) + SelectOpT select_op, ///< [in] Selection operator + EqualityOpT equality_op, ///< [in] Equality operator + OffsetT num_items, ///< [in] Total number of input items (i.e., length of \p d_in) + cudaStream_t stream, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel kernel dispatch configurations + KernelConfig select_if_config; + InitConfigs(ptx_version, select_if_config); + + // Dispatch + if (CubDebug(error = Dispatch( + d_temp_storage, + temp_storage_bytes, + d_in, + d_flags, + d_selected_out, + d_num_selected_out, + select_op, + equality_op, + num_items, + stream, + debug_synchronous, + ptx_version, + DeviceCompactInitKernel, + DeviceSelectSweepKernel, + select_if_config))) break; + } + while (0); + + return error; + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/GraphBLAS/CUDA/local_cub/device/dispatch/dispatch_spmv_orig.cuh b/GraphBLAS/CUDA/local_cub/device/dispatch/dispatch_spmv_orig.cuh new file mode 100644 index 0000000000..ab9c5346d2 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/device/dispatch/dispatch_spmv_orig.cuh @@ -0,0 +1,834 @@ + +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::DeviceSpmv provides device-wide parallel operations for performing sparse-matrix * vector multiplication (SpMV). + */ + +#pragma once + +#include +#include + +#include "../../agent/single_pass_scan_operators.cuh" +#include "../../agent/agent_segment_fixup.cuh" +#include "../../agent/agent_spmv_orig.cuh" +#include "../../util_type.cuh" +#include "../../util_debug.cuh" +#include "../../util_device.cuh" +#include "../../thread/thread_search.cuh" +#include "../../grid/grid_queue.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * SpMV kernel entry points + *****************************************************************************/ + +/** + * Spmv search kernel. Identifies merge path starting coordinates for each tile. + */ +template < + typename AgentSpmvPolicyT, ///< Parameterized SpmvPolicy tuning policy type + typename ValueT, ///< Matrix and vector value type + typename OffsetT> ///< Signed integer type for sequence offsets +__global__ void DeviceSpmv1ColKernel( + SpmvParams spmv_params) ///< [in] SpMV input parameter bundle +{ + typedef CacheModifiedInputIterator< + AgentSpmvPolicyT::VECTOR_VALUES_LOAD_MODIFIER, + ValueT, + OffsetT> + VectorValueIteratorT; + + VectorValueIteratorT wrapped_vector_x(spmv_params.d_vector_x); + + int row_idx = (blockIdx.x * blockDim.x) + threadIdx.x; + if (row_idx < spmv_params.num_rows) + { + OffsetT end_nonzero_idx = spmv_params.d_row_end_offsets[row_idx]; + OffsetT nonzero_idx = spmv_params.d_row_end_offsets[row_idx - 1]; + + ValueT value = 0.0; + if (end_nonzero_idx != nonzero_idx) + { + value = spmv_params.d_values[nonzero_idx] * wrapped_vector_x[spmv_params.d_column_indices[nonzero_idx]]; + } + + spmv_params.d_vector_y[row_idx] = value; + } +} + + +/** + * Spmv search kernel. 
Identifies merge path starting coordinates for each tile. + */ +template < + typename SpmvPolicyT, ///< Parameterized SpmvPolicy tuning policy type + typename OffsetT, ///< Signed integer type for sequence offsets + typename CoordinateT, ///< Merge path coordinate type + typename SpmvParamsT> ///< SpmvParams type +__global__ void DeviceSpmvSearchKernel( + int num_merge_tiles, ///< [in] Number of SpMV merge tiles (spmv grid size) + CoordinateT* d_tile_coordinates, ///< [out] Pointer to the temporary array of tile starting coordinates + SpmvParamsT spmv_params) ///< [in] SpMV input parameter bundle +{ + /// Constants + enum + { + BLOCK_THREADS = SpmvPolicyT::BLOCK_THREADS, + ITEMS_PER_THREAD = SpmvPolicyT::ITEMS_PER_THREAD, + TILE_ITEMS = BLOCK_THREADS * ITEMS_PER_THREAD, + }; + + typedef CacheModifiedInputIterator< + SpmvPolicyT::ROW_OFFSETS_SEARCH_LOAD_MODIFIER, + OffsetT, + OffsetT> + RowOffsetsSearchIteratorT; + + // Find the starting coordinate for all tiles (plus the end coordinate of the last one) + int tile_idx = (blockIdx.x * blockDim.x) + threadIdx.x; + if (tile_idx < num_merge_tiles + 1) + { + OffsetT diagonal = (tile_idx * TILE_ITEMS); + CoordinateT tile_coordinate; + CountingInputIterator nonzero_indices(0); + + // Search the merge path + MergePathSearch( + diagonal, + RowOffsetsSearchIteratorT(spmv_params.d_row_end_offsets), + nonzero_indices, + spmv_params.num_rows, + spmv_params.num_nonzeros, + tile_coordinate); + + // Output starting offset + d_tile_coordinates[tile_idx] = tile_coordinate; + } +} + + +/** + * Spmv agent entry point + */ +template < + typename SpmvPolicyT, ///< Parameterized SpmvPolicy tuning policy type + typename ScanTileStateT, ///< Tile status interface type + typename ValueT, ///< Matrix and vector value type + typename OffsetT, ///< Signed integer type for sequence offsets + typename CoordinateT, ///< Merge path coordinate type + bool HAS_ALPHA, ///< Whether the input parameter Alpha is 1 + bool HAS_BETA> ///< Whether the input parameter Beta is 0 +__launch_bounds__ (int(SpmvPolicyT::BLOCK_THREADS)) +__global__ void DeviceSpmvKernel( + SpmvParams spmv_params, ///< [in] SpMV input parameter bundle + CoordinateT* d_tile_coordinates, ///< [in] Pointer to the temporary array of tile starting coordinates + KeyValuePair* d_tile_carry_pairs, ///< [out] Pointer to the temporary array carry-out dot product row-ids, one per block + int num_tiles, ///< [in] Number of merge tiles + ScanTileStateT tile_state, ///< [in] Tile status interface for fixup reduce-by-key kernel + int num_segment_fixup_tiles) ///< [in] Number of reduce-by-key tiles (fixup grid size) +{ + // Spmv agent type specialization + typedef AgentSpmv< + SpmvPolicyT, + ValueT, + OffsetT, + HAS_ALPHA, + HAS_BETA> + AgentSpmvT; + + // Shared memory for AgentSpmv + __shared__ typename AgentSpmvT::TempStorage temp_storage; + + AgentSpmvT(temp_storage, spmv_params).ConsumeTile( + d_tile_coordinates, + d_tile_carry_pairs, + num_tiles); + + // Initialize fixup tile status + tile_state.InitializeStatus(num_segment_fixup_tiles); + +} + + +/** + * Multi-block reduce-by-key sweep kernel entry point + */ +template < + typename AgentSegmentFixupPolicyT, ///< Parameterized AgentSegmentFixupPolicy tuning policy type + typename PairsInputIteratorT, ///< Random-access input iterator type for keys + typename AggregatesOutputIteratorT, ///< Random-access output iterator type for values + typename OffsetT, ///< Signed integer type for global offsets + typename ScanTileStateT> ///< Tile status interface type 
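// -----------------------------------------------------------------------------
// [Editorial sketch, not part of the vendored CUB source.] The search kernel
// above load-balances SpMV by splitting the conceptual "merge" of the
// row_end_offsets array with the counting sequence 0..num_nonzeros-1 into
// equal-sized tiles. A plain CPU version of that diagonal binary search
// (function and variable names invented for illustration) looks like this:
//
//   #include <algorithm>
//
//   // Find the (row, nonzero) coordinate at which the given merge-path
//   // diagonal crosses the path defined by row_end_offsets and the counting
//   // sequence of nonzero indices.
//   void merge_path_search(long diagonal, const long *row_end_offsets,
//                          long num_rows, long num_nonzeros,
//                          long &row, long &nonzero)
//   {
//       long lo = std::max(diagonal - num_nonzeros, 0L);
//       long hi = std::min(diagonal, num_rows);
//       while (lo < hi)
//       {
//           long mid = (lo + hi) / 2;
//           // The counting sequence satisfies b[i] == i, so b[diagonal-mid-1]
//           // is simply diagonal - mid - 1.
//           if (row_end_offsets[mid] <= diagonal - mid - 1)
//               lo = mid + 1;
//           else
//               hi = mid;
//       }
//       row     = lo;             // rows consumed along this diagonal
//       nonzero = diagonal - lo;  // nonzeros consumed along this diagonal
//   }
// -----------------------------------------------------------------------------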
+__launch_bounds__ (int(AgentSegmentFixupPolicyT::BLOCK_THREADS)) +__global__ void DeviceSegmentFixupKernel( + PairsInputIteratorT d_pairs_in, ///< [in] Pointer to the array carry-out dot product row-ids, one per spmv block + AggregatesOutputIteratorT d_aggregates_out, ///< [in,out] Output value aggregates + OffsetT num_items, ///< [in] Total number of items to select from + int num_tiles, ///< [in] Total number of tiles for the entire problem + ScanTileStateT tile_state) ///< [in] Tile status interface +{ + // Thread block type for reducing tiles of value segments + typedef AgentSegmentFixup< + AgentSegmentFixupPolicyT, + PairsInputIteratorT, + AggregatesOutputIteratorT, + cub::Equality, + cub::Sum, + OffsetT> + AgentSegmentFixupT; + + // Shared memory for AgentSegmentFixup + __shared__ typename AgentSegmentFixupT::TempStorage temp_storage; + + // Process tiles + AgentSegmentFixupT(temp_storage, d_pairs_in, d_aggregates_out, cub::Equality(), cub::Sum()).ConsumeRange( + num_items, + num_tiles, + tile_state); +} + + +/****************************************************************************** + * Dispatch + ******************************************************************************/ + +/** + * Utility class for dispatching the appropriately-tuned kernels for DeviceSpmv + */ +template < + typename ValueT, ///< Matrix and vector value type + typename OffsetT> ///< Signed integer type for global offsets +struct DispatchSpmv +{ + //--------------------------------------------------------------------- + // Constants and Types + //--------------------------------------------------------------------- + + enum + { + INIT_KERNEL_THREADS = 128 + }; + + // SpmvParams bundle type + typedef SpmvParams SpmvParamsT; + + // 2D merge path coordinate type + typedef typename CubVector::Type CoordinateT; + + // Tile status descriptor interface type + typedef ReduceByKeyScanTileState ScanTileStateT; + + // Tuple type for scanning (pairs accumulated segment-value with segment-index) + typedef KeyValuePair KeyValuePairT; + + + //--------------------------------------------------------------------- + // Tuning policies + //--------------------------------------------------------------------- + + /// SM11 + struct Policy110 + { + typedef AgentSpmvPolicy< + 128, + 1, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS> + SpmvPolicyT; + + typedef AgentSegmentFixupPolicy< + 128, + 4, + BLOCK_LOAD_VECTORIZE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS> + SegmentFixupPolicyT; + }; + + /// SM20 + struct Policy200 + { + typedef AgentSpmvPolicy< + 96, + 18, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + false, + BLOCK_SCAN_RAKING> + SpmvPolicyT; + + typedef AgentSegmentFixupPolicy< + 128, + 4, + BLOCK_LOAD_VECTORIZE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS> + SegmentFixupPolicyT; + + }; + + + + /// SM30 + struct Policy300 + { + typedef AgentSpmvPolicy< + 96, + 6, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS> + SpmvPolicyT; + + typedef AgentSegmentFixupPolicy< + 128, + 4, + BLOCK_LOAD_VECTORIZE, + LOAD_DEFAULT, + BLOCK_SCAN_WARP_SCANS> + SegmentFixupPolicyT; + + }; + + + /// SM35 + struct Policy350 + { + typedef AgentSpmvPolicy< + (sizeof(ValueT) > 4) ? 96 : 128, + (sizeof(ValueT) > 4) ? 4 : 7, + LOAD_LDG, + LOAD_CA, + LOAD_LDG, + LOAD_LDG, + LOAD_LDG, + (sizeof(ValueT) > 4) ? 
true : false, + BLOCK_SCAN_WARP_SCANS> + SpmvPolicyT; + + typedef AgentSegmentFixupPolicy< + 128, + 3, + BLOCK_LOAD_VECTORIZE, + LOAD_LDG, + BLOCK_SCAN_WARP_SCANS> + SegmentFixupPolicyT; + }; + + + /// SM37 + struct Policy370 + { + + typedef AgentSpmvPolicy< + (sizeof(ValueT) > 4) ? 128 : 128, + (sizeof(ValueT) > 4) ? 9 : 14, + LOAD_LDG, + LOAD_CA, + LOAD_LDG, + LOAD_LDG, + LOAD_LDG, + false, + BLOCK_SCAN_WARP_SCANS> + SpmvPolicyT; + + typedef AgentSegmentFixupPolicy< + 128, + 3, + BLOCK_LOAD_VECTORIZE, + LOAD_LDG, + BLOCK_SCAN_WARP_SCANS> + SegmentFixupPolicyT; + }; + + /// SM50 + struct Policy500 + { + typedef AgentSpmvPolicy< + (sizeof(ValueT) > 4) ? 64 : 128, + (sizeof(ValueT) > 4) ? 6 : 7, + LOAD_LDG, + LOAD_DEFAULT, + (sizeof(ValueT) > 4) ? LOAD_LDG : LOAD_DEFAULT, + (sizeof(ValueT) > 4) ? LOAD_LDG : LOAD_DEFAULT, + LOAD_LDG, + (sizeof(ValueT) > 4) ? true : false, + (sizeof(ValueT) > 4) ? BLOCK_SCAN_WARP_SCANS : BLOCK_SCAN_RAKING_MEMOIZE> + SpmvPolicyT; + + + typedef AgentSegmentFixupPolicy< + 128, + 3, + BLOCK_LOAD_VECTORIZE, + LOAD_LDG, + BLOCK_SCAN_RAKING_MEMOIZE> + SegmentFixupPolicyT; + }; + + + /// SM60 + struct Policy600 + { + typedef AgentSpmvPolicy< + (sizeof(ValueT) > 4) ? 64 : 128, + (sizeof(ValueT) > 4) ? 5 : 7, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + LOAD_DEFAULT, + false, + BLOCK_SCAN_WARP_SCANS> + SpmvPolicyT; + + + typedef AgentSegmentFixupPolicy< + 128, + 3, + BLOCK_LOAD_DIRECT, + LOAD_LDG, + BLOCK_SCAN_WARP_SCANS> + SegmentFixupPolicyT; + }; + + + + //--------------------------------------------------------------------- + // Tuning policies of current PTX compiler pass + //--------------------------------------------------------------------- + +#if (CUB_PTX_ARCH >= 600) + typedef Policy600 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 500) + typedef Policy500 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 370) + typedef Policy370 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 350) + typedef Policy350 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 300) + typedef Policy300 PtxPolicy; + +#elif (CUB_PTX_ARCH >= 200) + typedef Policy200 PtxPolicy; + +#else + typedef Policy110 PtxPolicy; + +#endif + + // "Opaque" policies (whose parameterizations aren't reflected in the type signature) + struct PtxSpmvPolicyT : PtxPolicy::SpmvPolicyT {}; + struct PtxSegmentFixupPolicy : PtxPolicy::SegmentFixupPolicyT {}; + + + //--------------------------------------------------------------------- + // Utilities + //--------------------------------------------------------------------- + + /** + * Initialize kernel dispatch configurations with the policies corresponding to the PTX assembly we will use + */ + template + CUB_RUNTIME_FUNCTION __forceinline__ + static void InitConfigs( + int ptx_version, + KernelConfig &spmv_config, + KernelConfig &segment_fixup_config) + { + #if (CUB_PTX_ARCH > 0) + + // We're on the device, so initialize the kernel dispatch configurations with the current PTX policy + spmv_config.template Init(); + segment_fixup_config.template Init(); + + #else + + // We're on the host, so lookup and initialize the kernel dispatch configurations with the policies that match the device's PTX version + if (ptx_version >= 600) + { + spmv_config.template Init(); + segment_fixup_config.template Init(); + } + else if (ptx_version >= 500) + { + spmv_config.template Init(); + segment_fixup_config.template Init(); + } + else if (ptx_version >= 370) + { + spmv_config.template Init(); + segment_fixup_config.template Init(); + } + else if (ptx_version >= 350) + { + spmv_config.template Init(); + 
segment_fixup_config.template Init(); + } + else if (ptx_version >= 300) + { + spmv_config.template Init(); + segment_fixup_config.template Init(); + + } + else if (ptx_version >= 200) + { + spmv_config.template Init(); + segment_fixup_config.template Init(); + } + else + { + spmv_config.template Init(); + segment_fixup_config.template Init(); + } + + #endif + } + + + /** + * Kernel kernel dispatch configuration. + */ + struct KernelConfig + { + int block_threads; + int items_per_thread; + int tile_items; + + template + CUB_RUNTIME_FUNCTION __forceinline__ + void Init() + { + block_threads = PolicyT::BLOCK_THREADS; + items_per_thread = PolicyT::ITEMS_PER_THREAD; + tile_items = block_threads * items_per_thread; + } + }; + + + //--------------------------------------------------------------------- + // Dispatch entrypoints + //--------------------------------------------------------------------- + + /** + * Internal dispatch routine for computing a device-wide reduction using the + * specified kernel functions. + * + * If the input is larger than a single tile, this method uses two-passes of + * kernel invocations. + */ + template < + typename Spmv1ColKernelT, ///< Function type of cub::DeviceSpmv1ColKernel + typename SpmvSearchKernelT, ///< Function type of cub::AgentSpmvSearchKernel + typename SpmvKernelT, ///< Function type of cub::AgentSpmvKernel + typename SegmentFixupKernelT> ///< Function type of cub::DeviceSegmentFixupKernelT + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SpmvParamsT& spmv_params, ///< SpMV input parameter bundle + cudaStream_t stream, ///< [in] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous, ///< [in] Whether or not to synchronize the stream after every kernel launch to check for errors. Also causes launch configurations to be printed to the console. Default is \p false. 
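// -----------------------------------------------------------------------------
// [Editorial sketch, not part of the vendored CUB source.] A minimal host-side
// analogue of the KernelConfig / InitConfigs pattern used above: the policy
// structs and version cutoffs below are invented stand-ins, but the shape of
// the code mirrors how a tuning policy is turned into launch parameters.
//
//   struct ExamplePolicy350 { enum { BLOCK_THREADS = 128, ITEMS_PER_THREAD = 7  }; };
//   struct ExamplePolicy200 { enum { BLOCK_THREADS =  96, ITEMS_PER_THREAD = 18 }; };
//
//   struct ExampleKernelConfig
//   {
//       int block_threads, items_per_thread, tile_items;
//
//       template <typename PolicyT>
//       void Init()
//       {
//           block_threads    = PolicyT::BLOCK_THREADS;
//           items_per_thread = PolicyT::ITEMS_PER_THREAD;
//           tile_items       = block_threads * items_per_thread;
//       }
//   };
//
//   void InitExampleConfig(int ptx_version, ExampleKernelConfig &config)
//   {
//       if (ptx_version >= 350) config.Init<ExamplePolicy350>();
//       else                    config.Init<ExamplePolicy200>();
//   }
// -----------------------------------------------------------------------------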
+ Spmv1ColKernelT spmv_1col_kernel, ///< [in] Kernel function pointer to parameterization of DeviceSpmv1ColKernel + SpmvSearchKernelT spmv_search_kernel, ///< [in] Kernel function pointer to parameterization of AgentSpmvSearchKernel + SpmvKernelT spmv_kernel, ///< [in] Kernel function pointer to parameterization of AgentSpmvKernel + SegmentFixupKernelT segment_fixup_kernel, ///< [in] Kernel function pointer to parameterization of cub::DeviceSegmentFixupKernel + KernelConfig spmv_config, ///< [in] Dispatch parameters that match the policy that \p spmv_kernel was compiled for + KernelConfig segment_fixup_config) ///< [in] Dispatch parameters that match the policy that \p segment_fixup_kernel was compiled for + { +#ifndef CUB_RUNTIME_ENABLED + + // Kernel launch not supported from this device + return CubDebug(cudaErrorNotSupported ); + +#else + cudaError error = cudaSuccess; + do + { + if (spmv_params.num_cols == 1) + { + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + temp_storage_bytes = 1; + break; + } + + // Get search/init grid dims + int degen_col_kernel_block_size = INIT_KERNEL_THREADS; + int degen_col_kernel_grid_size = (spmv_params.num_rows + degen_col_kernel_block_size - 1) / degen_col_kernel_block_size; + + if (debug_synchronous) _CubLog("Invoking spmv_1col_kernel<<<%d, %d, 0, %lld>>>()\n", + degen_col_kernel_grid_size, degen_col_kernel_block_size, (long long) stream); + + // Invoke spmv_search_kernel + spmv_1col_kernel<<>>( + spmv_params); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + break; + } + + // Get device ordinal + int device_ordinal; + if (CubDebug(error = cudaGetDevice(&device_ordinal))) break; + + // Get SM count + int sm_count; + if (CubDebug(error = cudaDeviceGetAttribute (&sm_count, cudaDevAttrMultiProcessorCount, device_ordinal))) break; + + // Get max x-dimension of grid + int max_dim_x; + if (CubDebug(error = cudaDeviceGetAttribute(&max_dim_x, cudaDevAttrMaxGridDimX, device_ordinal))) break;; + + // Total number of spmv work items + int num_merge_items = spmv_params.num_rows + spmv_params.num_nonzeros; + + // Tile sizes of kernels + int merge_tile_size = spmv_config.block_threads * spmv_config.items_per_thread; + int segment_fixup_tile_size = segment_fixup_config.block_threads * segment_fixup_config.items_per_thread; + + // Number of tiles for kernels + unsigned int num_merge_tiles = (num_merge_items + merge_tile_size - 1) / merge_tile_size; + unsigned int num_segment_fixup_tiles = (num_merge_tiles + segment_fixup_tile_size - 1) / segment_fixup_tile_size; + + // Get SM occupancy for kernels + int spmv_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy( + spmv_sm_occupancy, + spmv_kernel, + spmv_config.block_threads))) break; + + int segment_fixup_sm_occupancy; + if (CubDebug(error = MaxSmOccupancy( + segment_fixup_sm_occupancy, + segment_fixup_kernel, + segment_fixup_config.block_threads))) break; + + // Get grid dimensions + dim3 spmv_grid_size( + CUB_MIN(num_merge_tiles, max_dim_x), + (num_merge_tiles + max_dim_x - 1) / max_dim_x, + 1); + + dim3 segment_fixup_grid_size( + CUB_MIN(num_segment_fixup_tiles, max_dim_x), + (num_segment_fixup_tiles + max_dim_x - 1) / max_dim_x, + 1); + + // Get the temporary storage allocation requirements + size_t allocation_sizes[3]; + if (CubDebug(error = 
ScanTileStateT::AllocationSize(num_segment_fixup_tiles, allocation_sizes[0]))) break; // bytes needed for reduce-by-key tile status descriptors + allocation_sizes[1] = num_merge_tiles * sizeof(KeyValuePairT); // bytes needed for block carry-out pairs + allocation_sizes[2] = (num_merge_tiles + 1) * sizeof(CoordinateT); // bytes needed for tile starting coordinates + + // Alias the temporary allocations from the single storage blob (or compute the necessary size of the blob) + void* allocations[3]; + if (CubDebug(error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes))) break; + if (d_temp_storage == NULL) + { + // Return if the caller is simply requesting the size of the storage allocation + break; + } + + // Construct the tile status interface + ScanTileStateT tile_state; + if (CubDebug(error = tile_state.Init(num_segment_fixup_tiles, allocations[0], allocation_sizes[0]))) break; + + // Alias the other allocations + KeyValuePairT* d_tile_carry_pairs = (KeyValuePairT*) allocations[1]; // Agent carry-out pairs + CoordinateT* d_tile_coordinates = (CoordinateT*) allocations[2]; // Agent starting coordinates + + // Get search/init grid dims + int search_block_size = INIT_KERNEL_THREADS; + int search_grid_size = (num_merge_tiles + 1 + search_block_size - 1) / search_block_size; + +#if (CUB_PTX_ARCH == 0) + // Init textures + if (CubDebug(error = spmv_params.t_vector_x.BindTexture(spmv_params.d_vector_x))) break; +#endif + + if (search_grid_size < sm_count) +// if (num_merge_tiles < spmv_sm_occupancy * sm_count) + { + // Not enough spmv tiles to saturate the device: have spmv blocks search their own staring coords + d_tile_coordinates = NULL; + } + else + { + // Use separate search kernel if we have enough spmv tiles to saturate the device + + // Log spmv_search_kernel configuration + if (debug_synchronous) _CubLog("Invoking spmv_search_kernel<<<%d, %d, 0, %lld>>>()\n", + search_grid_size, search_block_size, (long long) stream); + + // Invoke spmv_search_kernel + spmv_search_kernel<<>>( + num_merge_tiles, + d_tile_coordinates, + spmv_params); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + + // Log spmv_kernel configuration + if (debug_synchronous) _CubLog("Invoking spmv_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + spmv_grid_size.x, spmv_grid_size.y, spmv_grid_size.z, spmv_config.block_threads, (long long) stream, spmv_config.items_per_thread, spmv_sm_occupancy); + + // Invoke spmv_kernel + spmv_kernel<<>>( + spmv_params, + d_tile_coordinates, + d_tile_carry_pairs, + num_merge_tiles, + tile_state, + num_segment_fixup_tiles); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + + // Run reduce-by-key fixup if necessary + if (num_merge_tiles > 1) + { + // Log segment_fixup_kernel configuration + if (debug_synchronous) _CubLog("Invoking segment_fixup_kernel<<<{%d,%d,%d}, %d, 0, %lld>>>(), %d items per thread, %d SM occupancy\n", + segment_fixup_grid_size.x, segment_fixup_grid_size.y, segment_fixup_grid_size.z, segment_fixup_config.block_threads, (long long) stream, segment_fixup_config.items_per_thread, segment_fixup_sm_occupancy); + + // Invoke segment_fixup_kernel + 
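// -----------------------------------------------------------------------------
// [Editorial sketch, not part of the vendored CUB source.] The temporary
// storage handling a few lines above packs several allocations into one
// caller-provided blob. The helper below illustrates the general pattern
// (names and the 256-byte alignment are assumptions for illustration, not the
// exact CUB AliasTemporaries implementation):
//
//   #include <cstddef>
//
//   // Compute aligned offsets for `count` sub-allocations inside one blob and
//   // report the total size; pointers are only filled in when blob != NULL.
//   void alias_temporaries(void *blob, size_t &blob_bytes,
//                          void *ptrs[], const size_t sizes[], int count)
//   {
//       const size_t ALIGN  = 256;
//       size_t       offset = 0;
//       for (int i = 0; i < count; ++i)
//       {
//           if (blob != NULL)
//               ptrs[i] = static_cast<char *>(blob) + offset;
//           offset += (sizes[i] + ALIGN - 1) / ALIGN * ALIGN;
//       }
//       blob_bytes = offset;  // caller allocates this, then calls a second time
//   }
// -----------------------------------------------------------------------------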
segment_fixup_kernel<<>>( + d_tile_carry_pairs, + spmv_params.d_vector_y, + num_merge_tiles, + num_segment_fixup_tiles, + tile_state); + + // Check for failure to launch + if (CubDebug(error = cudaPeekAtLastError())) break; + + // Sync the stream if specified to flush runtime errors + if (debug_synchronous && (CubDebug(error = SyncStream(stream)))) break; + } + +#if (CUB_PTX_ARCH == 0) + // Free textures + if (CubDebug(error = spmv_params.t_vector_x.UnbindTexture())) break; +#endif + } + while (0); + + return error; + +#endif // CUB_RUNTIME_ENABLED + } + + + /** + * Internal dispatch routine for computing a device-wide reduction + */ + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Dispatch( + void* d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. + size_t& temp_storage_bytes, ///< [in,out] Reference to size in bytes of \p d_temp_storage allocation + SpmvParamsT& spmv_params, ///< SpMV input parameter bundle + cudaStream_t stream = 0, ///< [in] [optional] CUDA stream to launch kernels within. Default is stream0. + bool debug_synchronous = false) ///< [in] [optional] Whether or not to synchronize the stream after every kernel launch to check for errors. May cause significant slowdown. Default is \p false. + { + cudaError error = cudaSuccess; + do + { + // Get PTX version + int ptx_version; + #if (CUB_PTX_ARCH == 0) + if (CubDebug(error = PtxVersion(ptx_version))) break; + #else + ptx_version = CUB_PTX_ARCH; + #endif + + // Get kernel kernel dispatch configurations + KernelConfig spmv_config, segment_fixup_config; + InitConfigs(ptx_version, spmv_config, segment_fixup_config); + + if (CubDebug(error = Dispatch( + d_temp_storage, temp_storage_bytes, spmv_params, stream, debug_synchronous, + DeviceSpmv1ColKernel, + DeviceSpmvSearchKernel, + DeviceSpmvKernel, + DeviceSegmentFixupKernel, + spmv_config, segment_fixup_config))) break; + + } + while (0); + + return error; + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/GraphBLAS/CUDA/local_cub/grid/grid_barrier.cuh b/GraphBLAS/CUDA/local_cub/grid/grid_barrier.cuh new file mode 100644 index 0000000000..461fb44216 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/grid/grid_barrier.cuh @@ -0,0 +1,211 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::GridBarrier implements a software global barrier among thread blocks within a CUDA grid + */ + +#pragma once + +#include "../util_debug.cuh" +#include "../util_namespace.cuh" +#include "../thread/thread_load.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup GridModule + * @{ + */ + + +/** + * \brief GridBarrier implements a software global barrier among thread blocks within a CUDA grid + */ +class GridBarrier +{ +protected : + + typedef unsigned int SyncFlag; + + // Counters in global device memory + SyncFlag* d_sync; + +public: + + /** + * Constructor + */ + GridBarrier() : d_sync(NULL) {} + + + /** + * Synchronize + */ + __device__ __forceinline__ void Sync() const + { + volatile SyncFlag *d_vol_sync = d_sync; + + // Threadfence and syncthreads to make sure global writes are visible before + // thread-0 reports in with its sync counter + __threadfence(); + CTA_SYNC(); + + if (blockIdx.x == 0) + { + // Report in ourselves + if (threadIdx.x == 0) + { + d_vol_sync[blockIdx.x] = 1; + } + + CTA_SYNC(); + + // Wait for everyone else to report in + for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) + { + while (ThreadLoad(d_sync + peer_block) == 0) + { + __threadfence_block(); + } + } + + CTA_SYNC(); + + // Let everyone know it's safe to proceed + for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x) + { + d_vol_sync[peer_block] = 0; + } + } + else + { + if (threadIdx.x == 0) + { + // Report in + d_vol_sync[blockIdx.x] = 1; + + // Wait for acknowledgment + while (ThreadLoad(d_sync + blockIdx.x) == 1) + { + __threadfence_block(); + } + } + + CTA_SYNC(); + } + } +}; + + +/** + * \brief GridBarrierLifetime extends GridBarrier to provide lifetime management of the temporary device storage needed for cooperation. + * + * Uses RAII for lifetime, i.e., device resources are reclaimed when + * the destructor is called. 
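 *
 * \par
 * [Editorial sketch, not part of the original CUB header.] Typical usage,
 * assuming a kernel that needs a grid-wide software barrier between two
 * phases (the kernel and variable names below are invented):
 *
 * \code
 * __global__ void TwoPhaseKernel(cub::GridBarrier barrier)
 * {
 *     // ... phase 1 ...
 *     barrier.Sync();                 // every resident thread block waits here
 *     // ... phase 2 ...
 * }
 *
 * void Launch(int grid_size, int block_size)
 * {
 *     cub::GridBarrierLifetime barrier;
 *     barrier.Setup(grid_size);       // lazily allocates and zeroes the counters
 *     TwoPhaseKernel<<<grid_size, block_size>>>(barrier);
 * }
 * \endcode
 *
 * Because the barrier is a software construct, every participating thread
 * block must be co-resident on the device for Sync() to make progress, so
 * grid_size must not exceed the device's maximum resident-block count.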
+ */ +class GridBarrierLifetime : public GridBarrier +{ +protected: + + // Number of bytes backed by d_sync + size_t sync_bytes; + +public: + + /** + * Constructor + */ + GridBarrierLifetime() : GridBarrier(), sync_bytes(0) {} + + + /** + * DeviceFrees and resets the progress counters + */ + cudaError_t HostReset() + { + cudaError_t retval = cudaSuccess; + if (d_sync) + { + CubDebug(retval = cudaFree(d_sync)); + d_sync = NULL; + } + sync_bytes = 0; + return retval; + } + + + /** + * Destructor + */ + virtual ~GridBarrierLifetime() + { + HostReset(); + } + + + /** + * Sets up the progress counters for the next kernel launch (lazily + * allocating and initializing them if necessary) + */ + cudaError_t Setup(int sweep_grid_size) + { + cudaError_t retval = cudaSuccess; + do { + size_t new_sync_bytes = sweep_grid_size * sizeof(SyncFlag); + if (new_sync_bytes > sync_bytes) + { + if (d_sync) + { + if (CubDebug(retval = cudaFree(d_sync))) break; + } + + sync_bytes = new_sync_bytes; + + // Allocate and initialize to zero + if (CubDebug(retval = cudaMalloc((void**) &d_sync, sync_bytes))) break; + if (CubDebug(retval = cudaMemset(d_sync, 0, new_sync_bytes))) break; + } + } while (0); + + return retval; + } +}; + + +/** @} */ // end group GridModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/grid/grid_even_share.cuh b/GraphBLAS/CUDA/local_cub/grid/grid_even_share.cuh new file mode 100644 index 0000000000..f0b3a69ae0 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/grid/grid_even_share.cuh @@ -0,0 +1,222 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::GridEvenShare is a descriptor utility for distributing input among CUDA thread blocks in an "even-share" fashion. 
Each thread block gets roughly the same number of fixed-size work units (grains). + */ + + +#pragma once + +#include "../util_namespace.cuh" +#include "../util_macro.cuh" +#include "grid_mapping.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup GridModule + * @{ + */ + + +/** + * \brief GridEvenShare is a descriptor utility for distributing input among + * CUDA thread blocks in an "even-share" fashion. Each thread block gets roughly + * the same number of input tiles. + * + * \par Overview + * Each thread block is assigned a consecutive sequence of input tiles. To help + * preserve alignment and eliminate the overhead of guarded loads for all but the + * last thread block, to GridEvenShare assigns one of three different amounts of + * work to a given thread block: "big", "normal", or "last". The "big" workloads + * are one scheduling grain larger than "normal". The "last" work unit for the + * last thread block may be partially-full if the input is not an even multiple of + * the scheduling grain size. + * + * \par + * Before invoking a child grid, a parent thread will typically construct an + * instance of GridEvenShare. The instance can be passed to child thread blocks + * which can initialize their per-thread block offsets using \p BlockInit(). + */ +template +struct GridEvenShare +{ +private: + + OffsetT total_tiles; + int big_shares; + OffsetT big_share_items; + OffsetT normal_share_items; + OffsetT normal_base_offset; + +public: + + /// Total number of input items + OffsetT num_items; + + /// Grid size in thread blocks + int grid_size; + + /// OffsetT into input marking the beginning of the owning thread block's segment of input tiles + OffsetT block_offset; + + /// OffsetT into input of marking the end (one-past) of the owning thread block's segment of input tiles + OffsetT block_end; + + /// Stride between input tiles + OffsetT block_stride; + + + /** + * \brief Constructor. + */ + __host__ __device__ __forceinline__ GridEvenShare() : + total_tiles(0), + big_shares(0), + big_share_items(0), + normal_share_items(0), + normal_base_offset(0), + num_items(0), + grid_size(0), + block_offset(0), + block_end(0), + block_stride(0) + {} + + + /** + * \brief Dispatch initializer. To be called prior prior to kernel launch. + */ + __host__ __device__ __forceinline__ void DispatchInit( + OffsetT num_items, ///< Total number of input items + int max_grid_size, ///< Maximum grid size allowable (actual grid size may be less if not warranted by the the number of input items) + int tile_items) ///< Number of data items per input tile + { + this->block_offset = num_items; // Initialize past-the-end + this->block_end = num_items; // Initialize past-the-end + this->num_items = num_items; + this->total_tiles = (num_items + tile_items - 1) / tile_items; + this->grid_size = CUB_MIN(total_tiles, max_grid_size); + OffsetT avg_tiles_per_block = total_tiles / grid_size; + this->big_shares = total_tiles - (avg_tiles_per_block * grid_size); // leftover grains go to big blocks + this->normal_share_items = avg_tiles_per_block * tile_items; + this->normal_base_offset = big_shares * tile_items; + this->big_share_items = normal_share_items + tile_items; + } + + + /** + * \brief Initializes ranges for the specified thread block index. Specialized + * for a "raking" access pattern in which each thread block is assigned a + * consecutive sequence of input tiles. 
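 *
 * \par
 * [Editorial worked example, not in the original header.] With num_items = 1000
 * and tile_items = 128, DispatchInit above computes total_tiles = 8. If the
 * grid is capped at 3 blocks, then avg_tiles_per_block = 8/3 = 2 and
 * big_shares = 8 - 2*3 = 2: blocks 0 and 1 each rake 3 consecutive tiles
 * (384 items apiece), while block 2 receives the normal 2-tile share covering
 * the remaining 232 items, its second tile only partially full (104 items).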
+ */ + template + __device__ __forceinline__ void BlockInit( + int block_id, + Int2Type /*strategy_tag*/) + { + block_stride = TILE_ITEMS; + if (block_id < big_shares) + { + // This thread block gets a big share of grains (avg_tiles_per_block + 1) + block_offset = (block_id * big_share_items); + block_end = block_offset + big_share_items; + } + else if (block_id < total_tiles) + { + // This thread block gets a normal share of grains (avg_tiles_per_block) + block_offset = normal_base_offset + (block_id * normal_share_items); + block_end = CUB_MIN(num_items, block_offset + normal_share_items); + } + // Else default past-the-end + } + + + /** + * \brief Block-initialization, specialized for a "raking" access + * pattern in which each thread block is assigned a consecutive sequence + * of input tiles. + */ + template + __device__ __forceinline__ void BlockInit( + int block_id, + Int2Type /*strategy_tag*/) + { + block_stride = grid_size * TILE_ITEMS; + block_offset = (block_id * TILE_ITEMS); + block_end = num_items; + } + + + /** + * \brief Block-initialization, specialized for "strip mining" access + * pattern in which the input tiles assigned to each thread block are + * separated by a stride equal to the the extent of the grid. + */ + template < + int TILE_ITEMS, + GridMappingStrategy STRATEGY> + __device__ __forceinline__ void BlockInit() + { + BlockInit(blockIdx.x, Int2Type()); + } + + + /** + * \brief Block-initialization, specialized for a "raking" access + * pattern in which each thread block is assigned a consecutive sequence + * of input tiles. + */ + template + __device__ __forceinline__ void BlockInit( + OffsetT block_offset, ///< [in] Threadblock begin offset (inclusive) + OffsetT block_end) ///< [in] Threadblock end offset (exclusive) + { + this->block_offset = block_offset; + this->block_end = block_end; + this->block_stride = TILE_ITEMS; + } + + +}; + + + + + +/** @} */ // end group GridModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/grid/grid_mapping.cuh b/GraphBLAS/CUDA/local_cub/grid/grid_mapping.cuh new file mode 100644 index 0000000000..f0e9fded26 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/grid/grid_mapping.cuh @@ -0,0 +1,113 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks. + */ + +#pragma once + +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup GridModule + * @{ + */ + + +/****************************************************************************** + * Mapping policies + *****************************************************************************/ + + +/** + * \brief cub::GridMappingStrategy enumerates alternative strategies for mapping constant-sized tiles of device-wide data onto a grid of CUDA thread blocks. + */ +enum GridMappingStrategy +{ + /** + * \brief An a "raking" access pattern in which each thread block is + * assigned a consecutive sequence of input tiles + * + * \par Overview + * The input is evenly partitioned into \p p segments, where \p p is + * constant and corresponds loosely to the number of thread blocks that may + * actively reside on the target device. Each segment is comprised of + * consecutive tiles, where a tile is a small, constant-sized unit of input + * to be processed to completion before the thread block terminates or + * obtains more work. The kernel invokes \p p thread blocks, each + * of which iteratively consumes a segment of n/p elements + * in tile-size increments. + */ + GRID_MAPPING_RAKE, + + /** + * \brief An a "strip mining" access pattern in which the input tiles assigned + * to each thread block are separated by a stride equal to the the extent of + * the grid. + * + * \par Overview + * The input is evenly partitioned into \p p sets, where \p p is + * constant and corresponds loosely to the number of thread blocks that may + * actively reside on the target device. Each set is comprised of + * data tiles separated by stride \p tiles, where a tile is a small, + * constant-sized unit of input to be processed to completion before the + * thread block terminates or obtains more work. The kernel invokes \p p + * thread blocks, each of which iteratively consumes a segment of + * n/p elements in tile-size increments. + */ + GRID_MAPPING_STRIP_MINE, + + /** + * \brief A dynamic "queue-based" strategy for assigning input tiles to thread blocks. + * + * \par Overview + * The input is treated as a queue to be dynamically consumed by a grid of + * thread blocks. Work is atomically dequeued in tiles, where a tile is a + * unit of input to be processed to completion before the thread block + * terminates or obtains more work. The grid size \p p is constant, + * loosely corresponding to the number of thread blocks that may actively + * reside on the target device. 
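 *
 * \par
 * [Editorial sketch, not in the original header.] For contrast, the first two
 * strategies above differ only in how a block steps through its tiles; in
 * simplified form (tile_size, num_items, first_tile, last_tile and Process
 * are placeholders):
 *
 * \code
 * // GRID_MAPPING_RAKE: each block owns a consecutive run of tiles
 * for (int t = first_tile[blockIdx.x]; t < last_tile[blockIdx.x]; ++t)
 *     Process(t * tile_size);
 *
 * // GRID_MAPPING_STRIP_MINE: blocks interleave tiles, striding by the grid size
 * for (int t = blockIdx.x; t * tile_size < num_items; t += gridDim.x)
 *     Process(t * tile_size);
 * \endcode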
+ */ + GRID_MAPPING_DYNAMIC, +}; + + +/** @} */ // end group GridModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/grid/grid_queue.cuh b/GraphBLAS/CUDA/local_cub/grid/grid_queue.cuh new file mode 100644 index 0000000000..9615b14dbe --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/grid/grid_queue.cuh @@ -0,0 +1,220 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::GridQueue is a descriptor utility for dynamic queue management. + */ + +#pragma once + +#include "../util_namespace.cuh" +#include "../util_debug.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup GridModule + * @{ + */ + + +/** + * \brief GridQueue is a descriptor utility for dynamic queue management. + * + * \par Overview + * GridQueue descriptors provides abstractions for "filling" or + * "draining" globally-shared vectors. + * + * \par + * A "filling" GridQueue works by atomically-adding to a zero-initialized counter, + * returning a unique offset for the calling thread to write its items. + * The GridQueue maintains the total "fill-size". The fill counter must be reset + * using GridQueue::ResetFill by the host or kernel instance prior to the kernel instance that + * will be filling. + * + * \par + * Similarly, a "draining" GridQueue works by works by atomically-incrementing a + * zero-initialized counter, returning a unique offset for the calling thread to + * read its items. Threads can safely drain until the array's logical fill-size is + * exceeded. The drain counter must be reset using GridQueue::ResetDrain or + * GridQueue::FillAndResetDrain by the host or kernel instance prior to the kernel instance that + * will be filling. 
(For dynamic work distribution of existing data, the corresponding fill-size + * is simply the number of elements in the array.) + * + * \par + * Iterative work management can be implemented simply with a pair of flip-flopping + * work buffers, each with an associated set of fill and drain GridQueue descriptors. + * + * \tparam OffsetT Signed integer type for global offsets + */ +template +class GridQueue +{ +private: + + /// Counter indices + enum + { + FILL = 0, + DRAIN = 1, + }; + + /// Pair of counters + OffsetT *d_counters; + +public: + + /// Returns the device allocation size in bytes needed to construct a GridQueue instance + __host__ __device__ __forceinline__ + static size_t AllocationSize() + { + return sizeof(OffsetT) * 2; + } + + + /// Constructs an invalid GridQueue descriptor + __host__ __device__ __forceinline__ GridQueue() + : + d_counters(NULL) + {} + + + /// Constructs a GridQueue descriptor around the device storage allocation + __host__ __device__ __forceinline__ GridQueue( + void *d_storage) ///< Device allocation to back the GridQueue. Must be at least as big as AllocationSize(). + : + d_counters((OffsetT*) d_storage) + {} + + + /// This operation sets the fill-size and resets the drain counter, preparing the GridQueue for draining in the next kernel instance. To be called by the host or by a kernel prior to that which will be draining. + __host__ __device__ __forceinline__ cudaError_t FillAndResetDrain( + OffsetT fill_size, + cudaStream_t stream = 0) + { +#if (CUB_PTX_ARCH > 0) + (void)stream; + d_counters[FILL] = fill_size; + d_counters[DRAIN] = 0; + return cudaSuccess; +#else + OffsetT counters[2]; + counters[FILL] = fill_size; + counters[DRAIN] = 0; + return CubDebug(cudaMemcpyAsync(d_counters, counters, sizeof(OffsetT) * 2, cudaMemcpyHostToDevice, stream)); +#endif + } + + + /// This operation resets the drain so that it may advance to meet the existing fill-size. To be called by the host or by a kernel prior to that which will be draining. + __host__ __device__ __forceinline__ cudaError_t ResetDrain(cudaStream_t stream = 0) + { +#if (CUB_PTX_ARCH > 0) + (void)stream; + d_counters[DRAIN] = 0; + return cudaSuccess; +#else + return CubDebug(cudaMemsetAsync(d_counters + DRAIN, 0, sizeof(OffsetT), stream)); +#endif + } + + + /// This operation resets the fill counter. To be called by the host or by a kernel prior to that which will be filling. + __host__ __device__ __forceinline__ cudaError_t ResetFill(cudaStream_t stream = 0) + { +#if (CUB_PTX_ARCH > 0) + (void)stream; + d_counters[FILL] = 0; + return cudaSuccess; +#else + return CubDebug(cudaMemsetAsync(d_counters + FILL, 0, sizeof(OffsetT), stream)); +#endif + } + + + /// Returns the fill-size established by the parent or by the previous kernel. + __host__ __device__ __forceinline__ cudaError_t FillSize( + OffsetT &fill_size, + cudaStream_t stream = 0) + { +#if (CUB_PTX_ARCH > 0) + (void)stream; + fill_size = d_counters[FILL]; + return cudaSuccess; +#else + return CubDebug(cudaMemcpyAsync(&fill_size, d_counters + FILL, sizeof(OffsetT), cudaMemcpyDeviceToHost, stream)); +#endif + } + + + /// Drain \p num_items from the queue. Returns offset from which to read items. To be called from CUDA kernel. + __device__ __forceinline__ OffsetT Drain(OffsetT num_items) + { + return atomicAdd(d_counters + DRAIN, num_items); + } + + + /// Fill \p num_items into the queue. Returns offset from which to write items. To be called from CUDA kernel. 
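// -------------------------------------------------------------------------
// [Editorial sketch, not part of the vendored CUB source.] A typical
// "draining" use of GridQueue: the host sets the fill-size, then every
// thread block repeatedly claims work until the queue is exhausted. The
// kernel and buffer names below are invented for illustration.
//
//   __global__ void DrainKernel(cub::GridQueue<int> queue,
//                               const float *d_in, int num_items)
//   {
//       __shared__ int tile_base;
//       while (true)
//       {
//           if (threadIdx.x == 0)
//               tile_base = queue.Drain(blockDim.x);  // atomically claim a tile
//           __syncthreads();
//           if (tile_base >= num_items)
//               break;                                // logical fill-size exhausted
//           int i = tile_base + threadIdx.x;
//           if (i < num_items)
//           {
//               // ... process d_in[i] ...
//           }
//           __syncthreads();
//       }
//   }
//
//   // Host side, before launching DrainKernel:
//   //   void *d_storage;
//   //   cudaMalloc(&d_storage, cub::GridQueue<int>::AllocationSize());
//   //   cub::GridQueue<int> queue(d_storage);
//   //   queue.FillAndResetDrain(num_items);
// -------------------------------------------------------------------------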
+ __device__ __forceinline__ OffsetT Fill(OffsetT num_items) + { + return atomicAdd(d_counters + FILL, num_items); + } +}; + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + +/** + * Reset grid queue (call with 1 block of 1 thread) + */ +template +__global__ void FillAndResetDrainKernel( + GridQueue grid_queue, + OffsetT num_items) +{ + grid_queue.FillAndResetDrain(num_items); +} + + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/** @} */ // end group GridModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + + diff --git a/GraphBLAS/CUDA/local_cub/host/mutex.cuh b/GraphBLAS/CUDA/local_cub/host/mutex.cuh new file mode 100644 index 0000000000..ff7ec90ddc --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/host/mutex.cuh @@ -0,0 +1,171 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +/** + * \file + * Simple portable mutex + */ + + +#pragma once + +#if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800) + #include +#else + #if defined(_WIN32) || defined(_WIN64) + #include + + #define WIN32_LEAN_AND_MEAN + #define NOMINMAX + #include + #undef WIN32_LEAN_AND_MEAN + #undef NOMINMAX + + /** + * Compiler read/write barrier + */ + #pragma intrinsic(_ReadWriteBarrier) + + #endif +#endif + +#include "../util_namespace.cuh" + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * Simple portable mutex + * - Wraps std::mutex when compiled with C++11 or newer (supported on all platforms) + * - Uses GNU/Windows spinlock mechanisms for pre C++11 (supported on x86/x64 when compiled with cl.exe or g++) + */ +struct Mutex +{ +#if (__cplusplus > 199711L) || (defined(_MSC_VER) && _MSC_VER >= 1800) + + std::mutex mtx; + + void Lock() + { + mtx.lock(); + } + + void Unlock() + { + mtx.unlock(); + } + + void TryLock() + { + mtx.try_lock(); + } + +#else //__cplusplus > 199711L + + #if defined(_MSC_VER) + + // Microsoft VC++ + typedef long Spinlock; + + #else + + // GNU g++ + typedef int Spinlock; + + /** + * Compiler read/write barrier + */ + __forceinline__ void _ReadWriteBarrier() + { + __sync_synchronize(); + } + + /** + * Atomic exchange + */ + __forceinline__ long _InterlockedExchange(volatile int * const Target, const int Value) + { + // NOTE: __sync_lock_test_and_set would be an acquire barrier, so we force a full barrier + _ReadWriteBarrier(); + return __sync_lock_test_and_set(Target, Value); + } + + /** + * Pause instruction to prevent excess processor bus usage + */ + __forceinline__ void YieldProcessor() + { + } + + #endif // defined(_MSC_VER) + + /// Lock member + volatile Spinlock lock; + + /** + * Constructor + */ + Mutex() : lock(0) {} + + /** + * Return when the specified spinlock has been acquired + */ + __forceinline__ void Lock() + { + while (1) + { + if (!_InterlockedExchange(&lock, 1)) return; + while (lock) YieldProcessor(); + } + } + + + /** + * Release the specified spinlock + */ + __forceinline__ void Unlock() + { + _ReadWriteBarrier(); + lock = 0; + } + +#endif // __cplusplus > 199711L + +}; + + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + diff --git a/GraphBLAS/CUDA/local_cub/iterator/arg_index_input_iterator.cuh b/GraphBLAS/CUDA/local_cub/iterator/arg_index_input_iterator.cuh new file mode 100644 index 0000000000..95a84a5797 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/iterator/arg_index_input_iterator.cuh @@ -0,0 +1,259 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
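A rough usage sketch of the Mutex above: the calling code looks the same whichever backend (std::mutex or the spinlock) is compiled in. BumpCounter and shared_counter are illustrative names only:

cub::Mutex mutex;                 // shared by the participating host threads
static int shared_counter = 0;    // hypothetical shared host-side state

void BumpCounter()
{
    mutex.Lock();                 // std::mutex::lock() or the spinlock, per the #if above
    ++shared_counter;             // critical section
    mutex.Unlock();
}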
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include +#include + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../util_device.cuh" +#include "../util_namespace.cuh" + +#include + +#if (THRUST_VERSION >= 100700) + // This iterator is compatible with Thrust API 1.7 and newer + #include + #include +#endif // THRUST_VERSION + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup UtilIterator + * @{ + */ + + +/** + * \brief A random-access input wrapper for pairing dereferenced values with their corresponding indices (forming \p KeyValuePair tuples). + * + * \par Overview + * - ArgIndexInputIteratorTwraps a random access input iterator \p itr of type \p InputIteratorT. + * Dereferencing an ArgIndexInputIteratorTat offset \p i produces a \p KeyValuePair value whose + * \p key field is \p i and whose \p value field is itr[i]. + * - Can be used with any data type. + * - Can be constructed, manipulated, and exchanged within and between host and device + * functions. Wrapped host memory can only be dereferenced on the host, and wrapped + * device memory can only be dereferenced on the device. + * - Compatible with Thrust API v1.7 or newer. 
+ * + * \par Snippet + * The code snippet below illustrates the use of \p ArgIndexInputIteratorTto + * dereference an array of doubles + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize a device array + * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0] + * + * // Create an iterator wrapper + * cub::ArgIndexInputIterator itr(d_in); + * + * // Within device code: + * typedef typename cub::ArgIndexInputIterator::value_type Tuple; + * Tuple item_offset_pair.key = *itr; + * printf("%f @ %d\n", + * item_offset_pair.value, + * item_offset_pair.key); // 8.0 @ 0 + * + * itr = itr + 6; + * item_offset_pair.key = *itr; + * printf("%f @ %d\n", + * item_offset_pair.value, + * item_offset_pair.key); // 9.0 @ 6 + * + * \endcode + * + * \tparam InputIteratorT The value type of the wrapped input iterator + * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + * \tparam OutputValueT The paired value type of the tuple (Default: value type of input iterator) + */ +template < + typename InputIteratorT, + typename OffsetT = ptrdiff_t, + typename OutputValueT = typename std::iterator_traits::value_type> +class ArgIndexInputIterator +{ +public: + + // Required iterator traits + typedef ArgIndexInputIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef KeyValuePair value_type; ///< The type of the element the iterator can point to + typedef value_type* pointer; ///< The type of a pointer to an element the iterator can point to + typedef value_type reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::any_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + +private: + + InputIteratorT itr; + difference_type offset; + +public: + + /// Constructor + __host__ __device__ __forceinline__ ArgIndexInputIterator( + InputIteratorT itr, ///< Input iterator to wrap + difference_type offset = 0) ///< OffsetT (in items) from \p itr denoting the position of the iterator + : + itr(itr), + offset(offset) + {} + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + offset++; + return retval; + } + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + offset++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ reference operator*() const + { + value_type retval; + retval.value = itr[offset]; + retval.key = offset; + return retval; + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval(itr, offset + n); + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + offset += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval(itr, offset - n); + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ 
self_type& operator-=(Distance n) + { + offset -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return offset - other.offset; + } + + /// Array subscript + template + __host__ __device__ __forceinline__ reference operator[](Distance n) const + { + self_type offset = (*this) + n; + return *offset; + } + + /// Structure dereference + __host__ __device__ __forceinline__ pointer operator->() + { + return &(*(*this)); + } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return ((itr == rhs.itr) && (offset == rhs.offset)); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return ((itr != rhs.itr) || (offset != rhs.offset)); + } + + /// Normalize + __host__ __device__ __forceinline__ void normalize() + { + itr += offset; + offset = 0; + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& /*itr*/) + { + return os; + } +}; + + + +/** @} */ // end group UtilIterator + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/iterator/cache_modified_input_iterator.cuh b/GraphBLAS/CUDA/local_cub/iterator/cache_modified_input_iterator.cuh new file mode 100644 index 0000000000..b4ad91e2f1 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/iterator/cache_modified_input_iterator.cuh @@ -0,0 +1,240 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
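Since wrapped host memory may be dereferenced on the host, the key/value pairing produced by ArgIndexInputIterator can be checked without a kernel. A small sketch, assuming the iterator header above is on the include path (h_in and pair are hypothetical names):

#include <cstdio>
// plus the ArgIndexInputIterator header shown above

double h_in[7] = {8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0};

cub::ArgIndexInputIterator<double*> itr(h_in);
cub::ArgIndexInputIterator<double*>::value_type pair = itr[6];   // a KeyValuePair

printf("%f @ %d\n", pair.value, (int) pair.key);                 // 9.0 @ 6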
+ * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include +#include + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../util_device.cuh" +#include "../util_namespace.cuh" + +#if (THRUST_VERSION >= 100700) + // This iterator is compatible with Thrust API 1.7 and newer + #include + #include +#endif // THRUST_VERSION + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + + +/** + * \addtogroup UtilIterator + * @{ + */ + + +/** + * \brief A random-access input wrapper for dereferencing array values using a PTX cache load modifier. + * + * \par Overview + * - CacheModifiedInputIteratorTis a random-access input iterator that wraps a native + * device pointer of type ValueType*. \p ValueType references are + * made by reading \p ValueType values through loads modified by \p MODIFIER. + * - Can be used to load any data type from memory using PTX cache load modifiers (e.g., "LOAD_LDG", + * "LOAD_CG", "LOAD_CA", "LOAD_CS", "LOAD_CV", etc.). + * - Can be constructed, manipulated, and exchanged within and between host and device + * functions, but can only be dereferenced within device functions. + * - Compatible with Thrust API v1.7 or newer. + * + * \par Snippet + * The code snippet below illustrates the use of \p CacheModifiedInputIteratorTto + * dereference a device array of double using the "ldg" PTX load modifier + * (i.e., load values through texture cache). + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize a device array + * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0] + * + * // Create an iterator wrapper + * cub::CacheModifiedInputIterator itr(d_in); + * + * // Within device code: + * printf("%f\n", itr[0]); // 8.0 + * printf("%f\n", itr[1]); // 6.0 + * printf("%f\n", itr[6]); // 9.0 + * + * \endcode + * + * \tparam CacheLoadModifier The cub::CacheLoadModifier to use when accessing data + * \tparam ValueType The value type of this iterator + * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + */ +template < + CacheLoadModifier MODIFIER, + typename ValueType, + typename OffsetT = ptrdiff_t> +class CacheModifiedInputIterator +{ +public: + + // Required iterator traits + typedef CacheModifiedInputIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef ValueType value_type; ///< The type of the element the iterator can point to + typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to + typedef ValueType reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::device_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + + +public: + + /// Wrapped native pointer + ValueType* ptr; + + /// Constructor + template + __host__ __device__ __forceinline__ CacheModifiedInputIterator( + QualifiedValueType* ptr) ///< Native pointer to wrap + : + ptr(const_cast::Type *>(ptr)) 
+ {} + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + ptr++; + return retval; + } + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + ptr++; + return *this; + } + + /// Indirection + __device__ __forceinline__ reference operator*() const + { + return ThreadLoad(ptr); + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval(ptr + n); + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + ptr += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval(ptr - n); + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) + { + ptr -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return ptr - other.ptr; + } + + /// Array subscript + template + __device__ __forceinline__ reference operator[](Distance n) const + { + return ThreadLoad(ptr + n); + } + + /// Structure dereference + __device__ __forceinline__ pointer operator->() + { + return &ThreadLoad(ptr); + } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return (ptr == rhs.ptr); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return (ptr != rhs.ptr); + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& /*itr*/) + { + return os; + } +}; + + + +/** @} */ // end group UtilIterator + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/iterator/cache_modified_output_iterator.cuh b/GraphBLAS/CUDA/local_cub/iterator/cache_modified_output_iterator.cuh new file mode 100644 index 0000000000..c3e3321d30 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/iterator/cache_modified_output_iterator.cuh @@ -0,0 +1,254 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
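A short kernel sketch of the wrapper above, routing read-only loads through the LOAD_LDG modifier; ScaleKernel, d_in, d_out and n are placeholder names:

__global__ void ScaleKernel(const double *d_in, double *d_out, int n)
{
    // Wrap the input pointer so every read is issued as an LDG (texture-path) load
    cub::CacheModifiedInputIterator<cub::LOAD_LDG, double> in(d_in);

    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        d_out[i] = 2.0 * in[i];     // in[i] -> ThreadLoad<LOAD_LDG>(ptr + i)
}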
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include +#include + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../util_device.cuh" +#include "../util_namespace.cuh" + +#if (THRUST_VERSION >= 100700) + // This iterator is compatible with Thrust API 1.7 and newer + #include + #include +#endif // THRUST_VERSION + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilIterator + * @{ + */ + + +/** + * \brief A random-access output wrapper for storing array values using a PTX cache-modifier. + * + * \par Overview + * - CacheModifiedOutputIterator is a random-access output iterator that wraps a native + * device pointer of type ValueType*. \p ValueType references are + * made by writing \p ValueType values through stores modified by \p MODIFIER. + * - Can be used to store any data type to memory using PTX cache store modifiers (e.g., "STORE_WB", + * "STORE_CG", "STORE_CS", "STORE_WT", etc.). + * - Can be constructed, manipulated, and exchanged within and between host and device + * functions, but can only be dereferenced within device functions. + * - Compatible with Thrust API v1.7 or newer. + * + * \par Snippet + * The code snippet below illustrates the use of \p CacheModifiedOutputIterator to + * dereference a device array of doubles using the "wt" PTX load modifier + * (i.e., write-through to system memory). 
+ * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize a device array + * double *d_out; // e.g., [, , , , , , ] + * + * // Create an iterator wrapper + * cub::CacheModifiedOutputIterator itr(d_out); + * + * // Within device code: + * itr[0] = 8.0; + * itr[1] = 66.0; + * itr[55] = 24.0; + * + * \endcode + * + * \par Usage Considerations + * - Can only be dereferenced within device code + * + * \tparam CacheStoreModifier The cub::CacheStoreModifier to use when accessing data + * \tparam ValueType The value type of this iterator + * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + */ +template < + CacheStoreModifier MODIFIER, + typename ValueType, + typename OffsetT = ptrdiff_t> +class CacheModifiedOutputIterator +{ +private: + + // Proxy object + struct Reference + { + ValueType* ptr; + + /// Constructor + __host__ __device__ __forceinline__ Reference(ValueType* ptr) : ptr(ptr) {} + + /// Assignment + __device__ __forceinline__ ValueType operator =(ValueType val) + { + ThreadStore(ptr, val); + return val; + } + }; + +public: + + // Required iterator traits + typedef CacheModifiedOutputIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef void value_type; ///< The type of the element the iterator can point to + typedef void pointer; ///< The type of a pointer to an element the iterator can point to + typedef Reference reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::device_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + +private: + + ValueType* ptr; + +public: + + /// Constructor + template + __host__ __device__ __forceinline__ CacheModifiedOutputIterator( + QualifiedValueType* ptr) ///< Native pointer to wrap + : + ptr(const_cast::Type *>(ptr)) + {} + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + ptr++; + return retval; + } + + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + ptr++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ reference operator*() const + { + return Reference(ptr); + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval(ptr + n); + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + ptr += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval(ptr - n); + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) + { + ptr -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return ptr - other.ptr; + } + + /// Array subscript + template + __host__ __device__ __forceinline__ reference operator[](Distance n) const + 
{ + return Reference(ptr + n); + } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return (ptr == rhs.ptr); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return (ptr != rhs.ptr); + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& itr) + { + return os; + } +}; + + +/** @} */ // end group UtilIterator + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/iterator/constant_input_iterator.cuh b/GraphBLAS/CUDA/local_cub/iterator/constant_input_iterator.cuh new file mode 100644 index 0000000000..1e0a91044d --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/iterator/constant_input_iterator.cuh @@ -0,0 +1,235 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include +#include + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../util_namespace.cuh" + +#if (THRUST_VERSION >= 100700) + // This iterator is compatible with Thrust API 1.7 and newer + #include + #include +#endif // THRUST_VERSION + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilIterator + * @{ + */ + + +/** + * \brief A random-access input generator for dereferencing a sequence of homogeneous values + * + * \par Overview + * - Read references to a ConstantInputIteratorTiterator always return the supplied constant + * of type \p ValueType. + * - Can be used with any data type. + * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device + * functions. + * - Compatible with Thrust API v1.7 or newer. 
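Combining the output wrapper above with the constant-input generator just introduced, a hedged sketch that streams the constant 5.0 into an array using write-through (STORE_WT) stores; FillKernel, d_out and n are illustrative names:

__global__ void FillKernel(double *d_out, int n)
{
    cub::ConstantInputIterator<double> five(5.0);                       // always reads 5.0
    cub::CacheModifiedOutputIterator<cub::STORE_WT, double> out(d_out); // write-through stores

    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        out[i] = five[i];           // proxy assignment -> ThreadStore<STORE_WT>(d_out + i, 5.0)
}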
+ * + * \par Snippet + * The code snippet below illustrates the use of \p ConstantInputIteratorTto + * dereference a sequence of homogeneous doubles. + * \par + * \code + * #include // or equivalently + * + * cub::ConstantInputIterator itr(5.0); + * + * printf("%f\n", itr[0]); // 5.0 + * printf("%f\n", itr[1]); // 5.0 + * printf("%f\n", itr[2]); // 5.0 + * printf("%f\n", itr[50]); // 5.0 + * + * \endcode + * + * \tparam ValueType The value type of this iterator + * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + */ +template < + typename ValueType, + typename OffsetT = ptrdiff_t> +class ConstantInputIterator +{ +public: + + // Required iterator traits + typedef ConstantInputIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef ValueType value_type; ///< The type of the element the iterator can point to + typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to + typedef ValueType reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::any_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + +private: + + ValueType val; + OffsetT offset; +#ifdef _WIN32 + OffsetT pad[CUB_MAX(1, (16 / sizeof(OffsetT) - 1))]; // Workaround for win32 parameter-passing bug (ulonglong2 argmin DeviceReduce) +#endif + +public: + + /// Constructor + __host__ __device__ __forceinline__ ConstantInputIterator( + ValueType val, ///< Starting value for the iterator instance to report + OffsetT offset = 0) ///< Base offset + : + val(val), + offset(offset) + {} + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + offset++; + return retval; + } + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + offset++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ reference operator*() const + { + return val; + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval(val, offset + n); + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + offset += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval(val, offset - n); + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) + { + offset -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return offset - other.offset; + } + + /// Array subscript + template + __host__ __device__ __forceinline__ reference operator[](Distance /*n*/) const + { + return val; + } + + /// Structure dereference + __host__ __device__ __forceinline__ pointer operator->() + { + return &val; + } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const 
self_type& rhs) + { + return (offset == rhs.offset) && ((val == rhs.val)); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return (offset != rhs.offset) || (val!= rhs.val); + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& itr) + { + os << "[" << itr.val << "," << itr.offset << "]"; + return os; + } + +}; + + +/** @} */ // end group UtilIterator + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/iterator/counting_input_iterator.cuh b/GraphBLAS/CUDA/local_cub/iterator/counting_input_iterator.cuh new file mode 100644 index 0000000000..7f49348d6c --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/iterator/counting_input_iterator.cuh @@ -0,0 +1,228 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include +#include + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../util_device.cuh" +#include "../util_namespace.cuh" + +#if (THRUST_VERSION >= 100700) + // This iterator is compatible with Thrust API 1.7 and newer + #include + #include +#endif // THRUST_VERSION + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup UtilIterator + * @{ + */ + +/** + * \brief A random-access input generator for dereferencing a sequence of incrementing integer values. + * + * \par Overview + * - After initializing a CountingInputIteratorTto a certain integer \p base, read references + * at \p offset will return the value \p base + \p offset. + * - Can be constructed, manipulated, dereferenced, and exchanged within and between host and device + * functions. 
+ * - Compatible with Thrust API v1.7 or newer. + * + * \par Snippet + * The code snippet below illustrates the use of \p CountingInputIteratorTto + * dereference a sequence of incrementing integers. + * \par + * \code + * #include // or equivalently + * + * cub::CountingInputIterator itr(5); + * + * printf("%d\n", itr[0]); // 5 + * printf("%d\n", itr[1]); // 6 + * printf("%d\n", itr[2]); // 7 + * printf("%d\n", itr[50]); // 55 + * + * \endcode + * + * \tparam ValueType The value type of this iterator + * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + */ +template < + typename ValueType, + typename OffsetT = ptrdiff_t> +class CountingInputIterator +{ +public: + + // Required iterator traits + typedef CountingInputIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef ValueType value_type; ///< The type of the element the iterator can point to + typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to + typedef ValueType reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::any_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + +private: + + ValueType val; + +public: + + /// Constructor + __host__ __device__ __forceinline__ CountingInputIterator( + const ValueType &val) ///< Starting value for the iterator instance to report + : + val(val) + {} + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + val++; + return retval; + } + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + val++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ reference operator*() const + { + return val; + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval(val + (ValueType) n); + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + val += (ValueType) n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval(val - (ValueType) n); + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) + { + val -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return (difference_type) (val - other.val); + } + + /// Array subscript + template + __host__ __device__ __forceinline__ reference operator[](Distance n) const + { + return val + (ValueType) n; + } + + /// Structure dereference + __host__ __device__ __forceinline__ pointer operator->() + { + return &val; + } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return (val == rhs.val); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + 
return (val != rhs.val); + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& itr) + { + os << "[" << itr.val << "]"; + return os; + } + +}; + + + +/** @} */ // end group UtilIterator + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/iterator/discard_output_iterator.cuh b/GraphBLAS/CUDA/local_cub/iterator/discard_output_iterator.cuh new file mode 100644 index 0000000000..28473e5f22 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/iterator/discard_output_iterator.cuh @@ -0,0 +1,220 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
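Beyond the snippet above, the counting iterator also supports the usual random-access arithmetic without any backing storage, which a small host-side sketch makes concrete (itr, other and gap are hypothetical names):

cub::CountingInputIterator<int> itr(10);

int a = *itr;                       // 10
int b = itr[7];                     // 17
itr += 3;                           // itr now reports 13 at offset 0

cub::CountingInputIterator<int> other(25);
ptrdiff_t gap = other - itr;        // 12: distance recovered from the underlying values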
+ * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include +#include + +#include "../util_namespace.cuh" +#include "../util_macro.cuh" + +#if (THRUST_VERSION >= 100700) + // This iterator is compatible with Thrust API 1.7 and newer + #include + #include +#endif // THRUST_VERSION + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilIterator + * @{ + */ + + +/** + * \brief A discard iterator + */ +template +class DiscardOutputIterator +{ +public: + + // Required iterator traits + typedef DiscardOutputIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef void value_type; ///< The type of the element the iterator can point to + typedef void pointer; ///< The type of a pointer to an element the iterator can point to + typedef void reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::any_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + +private: + + OffsetT offset; + +#if defined(_WIN32) || !defined(_WIN64) + // Workaround for win32 parameter-passing bug (ulonglong2 argmin DeviceReduce) + OffsetT pad[CUB_MAX(1, (16 / sizeof(OffsetT) - 1))]; +#endif + +public: + + /// Constructor + __host__ __device__ __forceinline__ DiscardOutputIterator( + OffsetT offset = 0) ///< Base offset + : + offset(offset) + {} + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + offset++; + return retval; + } + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + offset++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ self_type& operator*() + { + // return self reference, which can be assigned to anything + return *this; + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval(offset + n); + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + offset += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval(offset - n); + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) + { + offset -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return offset - other.offset; + } + + /// Array subscript + template + __host__ __device__ __forceinline__ self_type& operator[](Distance n) + { + // return self reference, which can be assigned to anything + return *this; + } + + /// Structure dereference + __host__ __device__ __forceinline__ pointer operator->() + { + return; + } + + /// Assignment to self (no-op) + __host__ __device__ __forceinline__ void operator=(self_type const& 
other) + { + offset = other.offset; + } + + /// Assignment to anything else (no-op) + template + __host__ __device__ __forceinline__ void operator=(T const&) + {} + + /// Cast to void* operator + __host__ __device__ __forceinline__ operator void*() const { return NULL; } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return (offset == rhs.offset); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return (offset != rhs.offset); + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& itr) + { + os << "[" << itr.offset << "]"; + return os; + } + +}; + + +/** @} */ // end group UtilIterator + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/iterator/tex_obj_input_iterator.cuh b/GraphBLAS/CUDA/local_cub/iterator/tex_obj_input_iterator.cuh new file mode 100644 index 0000000000..b99103ec55 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/iterator/tex_obj_input_iterator.cuh @@ -0,0 +1,310 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include +#include + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../util_device.cuh" +#include "../util_debug.cuh" +#include "../util_namespace.cuh" + +#if (THRUST_VERSION >= 100700) + // This iterator is compatible with Thrust API 1.7 and newer + #include + #include +#endif // THRUST_VERSION + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup UtilIterator + * @{ + */ + + + +/** + * \brief A random-access input wrapper for dereferencing array values through texture cache. 
Uses newer Kepler-style texture objects. + * + * \par Overview + * - TexObjInputIteratorTwraps a native device pointer of type ValueType*. References + * to elements are to be loaded through texture cache. + * - Can be used to load any data type from memory through texture cache. + * - Can be manipulated and exchanged within and between host and device + * functions, can only be constructed within host functions, and can only be + * dereferenced within device functions. + * - With regard to nested/dynamic parallelism, TexObjInputIteratorTiterators may only be + * created by the host thread, but can be used by any descendant kernel. + * - Compatible with Thrust API v1.7 or newer. + * + * \par Snippet + * The code snippet below illustrates the use of \p TexRefInputIteratorTto + * dereference a device array of doubles through texture cache. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize a device array + * int num_items; // e.g., 7 + * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0] + * + * // Create an iterator wrapper + * cub::TexObjInputIterator itr; + * itr.BindTexture(d_in, sizeof(double) * num_items); + * ... + * + * // Within device code: + * printf("%f\n", itr[0]); // 8.0 + * printf("%f\n", itr[1]); // 6.0 + * printf("%f\n", itr[6]); // 9.0 + * + * ... + * itr.UnbindTexture(); + * + * \endcode + * + * \tparam T The value type of this iterator + * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + */ +template < + typename T, + typename OffsetT = ptrdiff_t> +class TexObjInputIterator +{ +public: + + // Required iterator traits + typedef TexObjInputIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef T value_type; ///< The type of the element the iterator can point to + typedef T* pointer; ///< The type of a pointer to an element the iterator can point to + typedef T reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::device_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + +private: + + // Largest texture word we can use in device + typedef typename UnitWord::TextureWord TextureWord; + + // Number of texture words per T + enum { + TEXTURE_MULTIPLE = sizeof(T) / sizeof(TextureWord) + }; + +private: + + T* ptr; + difference_type tex_offset; + cudaTextureObject_t tex_obj; + +public: + + /// Constructor + __host__ __device__ __forceinline__ TexObjInputIterator() + : + ptr(NULL), + tex_offset(0), + tex_obj(0) + {} + + /// Use this iterator to bind \p ptr with a texture reference + template + cudaError_t BindTexture( + QualifiedT *ptr, ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment + size_t bytes = size_t(-1), ///< Number of bytes in the range + size_t tex_offset = 0) ///< OffsetT (in items) from \p ptr denoting the position of the iterator + { + this->ptr = const_cast::Type *>(ptr); + this->tex_offset = tex_offset; + + cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc(); + cudaResourceDesc res_desc; + cudaTextureDesc tex_desc; + 
memset(&res_desc, 0, sizeof(cudaResourceDesc)); + memset(&tex_desc, 0, sizeof(cudaTextureDesc)); + res_desc.resType = cudaResourceTypeLinear; + res_desc.res.linear.devPtr = this->ptr; + res_desc.res.linear.desc = channel_desc; + res_desc.res.linear.sizeInBytes = bytes; + tex_desc.readMode = cudaReadModeElementType; + return cudaCreateTextureObject(&tex_obj, &res_desc, &tex_desc, NULL); + } + + /// Unbind this iterator from its texture reference + cudaError_t UnbindTexture() + { + return cudaDestroyTextureObject(tex_obj); + } + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + tex_offset++; + return retval; + } + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + tex_offset++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ reference operator*() const + { +#if (CUB_PTX_ARCH == 0) + // Simply dereference the pointer on the host + return ptr[tex_offset]; +#else + // Move array of uninitialized words, then alias and assign to return value + TextureWord words[TEXTURE_MULTIPLE]; + + #pragma unroll + for (int i = 0; i < TEXTURE_MULTIPLE; ++i) + { + words[i] = tex1Dfetch( + tex_obj, + (tex_offset * TEXTURE_MULTIPLE) + i); + } + + // Load from words + return *reinterpret_cast(words); +#endif + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval; + retval.ptr = ptr; + retval.tex_obj = tex_obj; + retval.tex_offset = tex_offset + n; + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + tex_offset += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval; + retval.ptr = ptr; + retval.tex_obj = tex_obj; + retval.tex_offset = tex_offset - n; + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) + { + tex_offset -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return tex_offset - other.tex_offset; + } + + /// Array subscript + template + __host__ __device__ __forceinline__ reference operator[](Distance n) const + { + self_type offset = (*this) + n; + return *offset; + } + + /// Structure dereference + __host__ __device__ __forceinline__ pointer operator->() + { + return &(*(*this)); + } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return ((ptr == rhs.ptr) && (tex_offset == rhs.tex_offset) && (tex_obj == rhs.tex_obj)); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return ((ptr != rhs.ptr) || (tex_offset != rhs.tex_offset) || (tex_obj != rhs.tex_obj)); + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& itr) + { + return os; + } + +}; + + + +/** @} */ // end group UtilIterator + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/iterator/tex_ref_input_iterator.cuh b/GraphBLAS/CUDA/local_cub/iterator/tex_ref_input_iterator.cuh new file mode 100644 index 0000000000..95d0ffbc96 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/iterator/tex_ref_input_iterator.cuh @@ -0,0 +1,374 @@ 
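The bind/use/unbind life cycle of the texture-object iterator above looks roughly as follows; ReadKernel, d_in, d_out, num_items, grid_size and block_size are placeholder names:

__global__ void ReadKernel(cub::TexObjInputIterator<double> in, double *d_out, int num_items)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < num_items)
        d_out[i] = in[i];           // tex1Dfetch through the bound texture object
}

// Host side: the iterator must be constructed and bound on the host,
// and unbound only after the consuming kernel has finished.
cub::TexObjInputIterator<double> in;
in.BindTexture(d_in, sizeof(double) * num_items);   // d_in aligned to textureAlignment
ReadKernel<<<grid_size, block_size>>>(in, d_out, num_items);
cudaDeviceSynchronize();
in.UnbindTexture();                                 // destroys the texture object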
+/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include +#include + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../util_device.cuh" +#include "../util_debug.cuh" +#include "../util_namespace.cuh" + +#if (CUDA_VERSION >= 5050) || defined(DOXYGEN_ACTIVE) // This iterator is compatible with CUDA 5.5 and newer + +#if (THRUST_VERSION >= 100700) // This iterator is compatible with Thrust API 1.7 and newer + #include + #include +#endif // THRUST_VERSION + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/****************************************************************************** + * Static file-scope Tesla/Fermi-style texture references + *****************************************************************************/ + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +// Anonymous namespace +namespace { + +/// Global texture reference specialized by type +template +struct IteratorTexRef +{ + /// And by unique ID + template + struct TexId + { + // Largest texture word we can use in device + typedef typename UnitWord::DeviceWord DeviceWord; + typedef typename UnitWord::TextureWord TextureWord; + + // Number of texture words per T + enum { + DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord), + TEXTURE_MULTIPLE = sizeof(T) / sizeof(TextureWord) + }; + + // Texture reference type + typedef texture TexRef; + + // Texture reference + static TexRef ref; + + /// Bind texture + static cudaError_t BindTexture(void *d_in, size_t &offset) + { + if (d_in) + { + cudaChannelFormatDesc tex_desc = cudaCreateChannelDesc(); + ref.channelDesc = tex_desc; + return (CubDebug(cudaBindTexture(&offset, ref, d_in))); + } + + return cudaSuccess; + } 
+ + /// Unbind texture + static cudaError_t UnbindTexture() + { + return CubDebug(cudaUnbindTexture(ref)); + } + + /// Fetch element + template + static __device__ __forceinline__ T Fetch(Distance tex_offset) + { + DeviceWord temp[DEVICE_MULTIPLE]; + TextureWord *words = reinterpret_cast(temp); + + #pragma unroll + for (int i = 0; i < TEXTURE_MULTIPLE; ++i) + { + words[i] = tex1Dfetch(ref, (tex_offset * TEXTURE_MULTIPLE) + i); + } + + return reinterpret_cast(temp); + } + }; +}; + +// Texture reference definitions +template +template +typename IteratorTexRef::template TexId::TexRef IteratorTexRef::template TexId::ref = 0; + + +} // Anonymous namespace + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + +/** + * \addtogroup UtilIterator + * @{ + */ + + + +/** + * \brief A random-access input wrapper for dereferencing array values through texture cache. Uses older Tesla/Fermi-style texture references. + * + * \par Overview + * - TexRefInputIteratorTwraps a native device pointer of type ValueType*. References + * to elements are to be loaded through texture cache. + * - Can be used to load any data type from memory through texture cache. + * - Can be manipulated and exchanged within and between host and device + * functions, can only be constructed within host functions, and can only be + * dereferenced within device functions. + * - The \p UNIQUE_ID template parameter is used to statically name the underlying texture + * reference. Only one TexRefInputIteratorTinstance can be bound at any given time for a + * specific combination of (1) data type \p T, (2) \p UNIQUE_ID, (3) host + * thread, and (4) compilation .o unit. + * - With regard to nested/dynamic parallelism, TexRefInputIteratorTiterators may only be + * created by the host thread and used by a top-level kernel (i.e. the one which is launched + * from the host). + * - Compatible with Thrust API v1.7 or newer. + * - Compatible with CUDA toolkit v5.5 or newer. + * + * \par Snippet + * The code snippet below illustrates the use of \p TexRefInputIteratorTto + * dereference a device array of doubles through texture cache. + * \par + * \code + * #include // or equivalently + * + * // Declare, allocate, and initialize a device array + * int num_items; // e.g., 7 + * double *d_in; // e.g., [8.0, 6.0, 7.0, 5.0, 3.0, 0.0, 9.0] + * + * // Create an iterator wrapper + * cub::TexRefInputIterator itr; + * itr.BindTexture(d_in, sizeof(double) * num_items); + * ... + * + * // Within device code: + * printf("%f\n", itr[0]); // 8.0 + * printf("%f\n", itr[1]); // 6.0 + * printf("%f\n", itr[6]); // 9.0 + * + * ... 
+ * itr.UnbindTexture(); + * + * \endcode + * + * \tparam T The value type of this iterator + * \tparam UNIQUE_ID A globally-unique identifier (within the compilation unit) to name the underlying texture reference + * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + */ +template < + typename T, + int UNIQUE_ID, + typename OffsetT = ptrdiff_t> +class TexRefInputIterator +{ +public: + + // Required iterator traits + typedef TexRefInputIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef T value_type; ///< The type of the element the iterator can point to + typedef T* pointer; ///< The type of a pointer to an element the iterator can point to + typedef T reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::device_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + +private: + + T* ptr; + difference_type tex_offset; + + // Texture reference wrapper (old Tesla/Fermi-style textures) + typedef typename IteratorTexRef::template TexId TexId; + +public: +/* + /// Constructor + __host__ __device__ __forceinline__ TexRefInputIterator() + : + ptr(NULL), + tex_offset(0) + {} +*/ + /// Use this iterator to bind \p ptr with a texture reference + template + cudaError_t BindTexture( + QualifiedT *ptr, ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment + size_t bytes = size_t(-1), ///< Number of bytes in the range + size_t tex_offset = 0) ///< OffsetT (in items) from \p ptr denoting the position of the iterator + { + this->ptr = const_cast::Type *>(ptr); + size_t offset; + cudaError_t retval = TexId::BindTexture(this->ptr + tex_offset, offset); + this->tex_offset = (difference_type) (offset / sizeof(QualifiedT)); + return retval; + } + + /// Unbind this iterator from its texture reference + cudaError_t UnbindTexture() + { + return TexId::UnbindTexture(); + } + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + tex_offset++; + return retval; + } + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + tex_offset++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ reference operator*() const + { +#if (CUB_PTX_ARCH == 0) + // Simply dereference the pointer on the host + return ptr[tex_offset]; +#else + // Use the texture reference + return TexId::Fetch(tex_offset); +#endif + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval; + retval.ptr = ptr; + retval.tex_offset = tex_offset + n; + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + tex_offset += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval; + retval.ptr = ptr; + retval.tex_offset = tex_offset - n; + return retval; + } + + /// Subtraction assignment + template + __host__ 
__device__ __forceinline__ self_type& operator-=(Distance n) + { + tex_offset -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return tex_offset - other.tex_offset; + } + + /// Array subscript + template + __host__ __device__ __forceinline__ reference operator[](Distance n) const + { + self_type offset = (*this) + n; + return *offset; + } + + /// Structure dereference + __host__ __device__ __forceinline__ pointer operator->() + { + return &(*(*this)); + } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return ((ptr == rhs.ptr) && (tex_offset == rhs.tex_offset)); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return ((ptr != rhs.ptr) || (tex_offset != rhs.tex_offset)); + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& itr) + { + return os; + } + +}; + + + +/** @} */ // end group UtilIterator + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) + +#endif // CUDA_VERSION diff --git a/GraphBLAS/CUDA/local_cub/iterator/transform_input_iterator.cuh b/GraphBLAS/CUDA/local_cub/iterator/transform_input_iterator.cuh new file mode 100644 index 0000000000..dad1f50041 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/iterator/transform_input_iterator.cuh @@ -0,0 +1,252 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
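[Editor's note: an illustrative aside on the legacy TexRefInputIterator just defined, not part of the vendored file. Because the texture reference is a static variable named by the UNIQUE_ID template parameter, two simultaneously bound iterators of the same value type need distinct IDs. The pointers d_a, d_b and the count n below are hypothetical.]

    // Two independently bound texture-reference iterators over float data
    cub::TexRefInputIterator<float, 0> itr_a;       // static texture reference #0
    cub::TexRefInputIterator<float, 1> itr_b;       // static texture reference #1
    itr_a.BindTexture(d_a, sizeof(float) * n);
    itr_b.BindTexture(d_b, sizeof(float) * n);
    // ... launch top-level kernels that read itr_a[i] and itr_b[i] through texture cache ...
    itr_a.UnbindTexture();
    itr_b.UnbindTexture();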
+ * + ******************************************************************************/ + +/** + * \file + * Random-access iterator types + */ + +#pragma once + +#include +#include + +#include "../thread/thread_load.cuh" +#include "../thread/thread_store.cuh" +#include "../util_device.cuh" +#include "../util_namespace.cuh" + +#if (THRUST_VERSION >= 100700) + // This iterator is compatible with Thrust API 1.7 and newer + #include + #include +#endif // THRUST_VERSION + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup UtilIterator + * @{ + */ + + +/** + * \brief A random-access input wrapper for transforming dereferenced values. + * + * \par Overview + * - TransformInputIteratorTwraps a unary conversion functor of type \p + * ConversionOp and a random-access input iterator of type InputIteratorT, + * using the former to produce references of type \p ValueType from the latter. + * - Can be used with any data type. + * - Can be constructed, manipulated, and exchanged within and between host and device + * functions. Wrapped host memory can only be dereferenced on the host, and wrapped + * device memory can only be dereferenced on the device. + * - Compatible with Thrust API v1.7 or newer. + * + * \par Snippet + * The code snippet below illustrates the use of \p TransformInputIteratorTto + * dereference an array of integers, tripling the values and converting them to doubles. + * \par + * \code + * #include // or equivalently + * + * // Functor for tripling integer values and converting to doubles + * struct TripleDoubler + * { + * __host__ __device__ __forceinline__ + * double operator()(const int &a) const { + * return double(a * 3); + * } + * }; + * + * // Declare, allocate, and initialize a device array + * int *d_in; // e.g., [8, 6, 7, 5, 3, 0, 9] + * TripleDoubler conversion_op; + * + * // Create an iterator wrapper + * cub::TransformInputIterator itr(d_in, conversion_op); + * + * // Within device code: + * printf("%f\n", itr[0]); // 24.0 + * printf("%f\n", itr[1]); // 18.0 + * printf("%f\n", itr[6]); // 27.0 + * + * \endcode + * + * \tparam ValueType The value type of this iterator + * \tparam ConversionOp Unary functor type for mapping objects of type \p InputType to type \p ValueType. Must have member ValueType operator()(const InputType &datum). 
+ * \tparam InputIteratorT The type of the wrapped input iterator + * \tparam OffsetT The difference type of this iterator (Default: \p ptrdiff_t) + * + */ +template < + typename ValueType, + typename ConversionOp, + typename InputIteratorT, + typename OffsetT = ptrdiff_t> +class TransformInputIterator +{ +public: + + // Required iterator traits + typedef TransformInputIterator self_type; ///< My own type + typedef OffsetT difference_type; ///< Type to express the result of subtracting one iterator from another + typedef ValueType value_type; ///< The type of the element the iterator can point to + typedef ValueType* pointer; ///< The type of a pointer to an element the iterator can point to + typedef ValueType reference; ///< The type of a reference to an element the iterator can point to + +#if (THRUST_VERSION >= 100700) + // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods + typedef typename thrust::detail::iterator_facade_category< + thrust::any_system_tag, + thrust::random_access_traversal_tag, + value_type, + reference + >::type iterator_category; ///< The iterator category +#else + typedef std::random_access_iterator_tag iterator_category; ///< The iterator category +#endif // THRUST_VERSION + +private: + + ConversionOp conversion_op; + InputIteratorT input_itr; + +public: + + /// Constructor + __host__ __device__ __forceinline__ TransformInputIterator( + InputIteratorT input_itr, ///< Input iterator to wrap + ConversionOp conversion_op) ///< Conversion functor to wrap + : + conversion_op(conversion_op), + input_itr(input_itr) + {} + + /// Postfix increment + __host__ __device__ __forceinline__ self_type operator++(int) + { + self_type retval = *this; + input_itr++; + return retval; + } + + /// Prefix increment + __host__ __device__ __forceinline__ self_type operator++() + { + input_itr++; + return *this; + } + + /// Indirection + __host__ __device__ __forceinline__ reference operator*() const + { + return conversion_op(*input_itr); + } + + /// Addition + template + __host__ __device__ __forceinline__ self_type operator+(Distance n) const + { + self_type retval(input_itr + n, conversion_op); + return retval; + } + + /// Addition assignment + template + __host__ __device__ __forceinline__ self_type& operator+=(Distance n) + { + input_itr += n; + return *this; + } + + /// Subtraction + template + __host__ __device__ __forceinline__ self_type operator-(Distance n) const + { + self_type retval(input_itr - n, conversion_op); + return retval; + } + + /// Subtraction assignment + template + __host__ __device__ __forceinline__ self_type& operator-=(Distance n) + { + input_itr -= n; + return *this; + } + + /// Distance + __host__ __device__ __forceinline__ difference_type operator-(self_type other) const + { + return input_itr - other.input_itr; + } + + /// Array subscript + template + __host__ __device__ __forceinline__ reference operator[](Distance n) const + { + return conversion_op(input_itr[n]); + } + + /// Structure dereference + __host__ __device__ __forceinline__ pointer operator->() + { + return &conversion_op(*input_itr); + } + + /// Equal to + __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) + { + return (input_itr == rhs.input_itr); + } + + /// Not equal to + __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) + { + return (input_itr != rhs.input_itr); + } + + /// ostream operator + friend std::ostream& operator<<(std::ostream& os, const self_type& itr) + { + return os; + } +}; + + + 
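[Editor's note: a small, self-contained sketch of TransformInputIterator, added as an aside and not part of the vendored file. It wraps plain host memory, which the class documentation above permits, so the wrapper can be checked on the CPU; the Square functor and array contents are made up for illustration. Compile with nvcc because of the __host__ __device__ qualifiers.]

    #include <cstdio>
    #include "local_cub/iterator/transform_input_iterator.cuh"   // assumed include path

    // Hypothetical unary conversion functor
    struct Square
    {
        __host__ __device__ __forceinline__ int operator()(const int &x) const { return x * x; }
    };

    int main()
    {
        int h_in[4] = {1, 2, 3, 4};
        // Wrap a plain host pointer; dereferencing on the host is allowed.
        cub::TransformInputIterator<int, Square, int*> itr(h_in, Square());
        for (int i = 0; i < 4; ++i)
            printf("%d\n", itr[i]);         // prints 1, 4, 9, 16
        return 0;
    }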
+/** @} */ // end group UtilIterator + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/thread/thread_load.cuh b/GraphBLAS/CUDA/local_cub/thread/thread_load.cuh new file mode 100644 index 0000000000..b1ca412faf --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/thread/thread_load.cuh @@ -0,0 +1,438 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Thread utilities for reading memory using PTX cache modifiers. + */ + +#pragma once + +#include + +#include + +#include "../util_ptx.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup UtilIo + * @{ + */ + +//----------------------------------------------------------------------------- +// Tags and constants +//----------------------------------------------------------------------------- + +/** + * \brief Enumeration of cache modifiers for memory load operations. + */ +enum CacheLoadModifier +{ + LOAD_DEFAULT, ///< Default (no modifier) + LOAD_CA, ///< Cache at all levels + LOAD_CG, ///< Cache at global level + LOAD_CS, ///< Cache streaming (likely to be accessed once) + LOAD_CV, ///< Cache as volatile (including cached system lines) + LOAD_LDG, ///< Cache as texture + LOAD_VOLATILE, ///< Volatile (any memory space) +}; + + +/** + * \name Thread I/O (cache modified) + * @{ + */ + +/** + * \brief Thread utility for reading memory using cub::CacheLoadModifier cache modifiers. Can be used to load any data type. 
+ * + * \par Example + * \code + * #include // or equivalently + * + * // 32-bit load using cache-global modifier: + * int *d_in; + * int val = cub::ThreadLoad(d_in + threadIdx.x); + * + * // 16-bit load using default modifier + * short *d_in; + * short val = cub::ThreadLoad(d_in + threadIdx.x); + * + * // 256-bit load using cache-volatile modifier + * double4 *d_in; + * double4 val = cub::ThreadLoad(d_in + threadIdx.x); + * + * // 96-bit load using cache-streaming modifier + * struct TestFoo { bool a; short b; }; + * TestFoo *d_struct; + * TestFoo val = cub::ThreadLoad(d_in + threadIdx.x); + * \endcode + * + * \tparam MODIFIER [inferred] CacheLoadModifier enumeration + * \tparam InputIteratorT [inferred] Input iterator type \iterator + */ +template < + CacheLoadModifier MODIFIER, + typename InputIteratorT> +__device__ __forceinline__ typename std::iterator_traits::value_type ThreadLoad(InputIteratorT itr); + + +//@} end member group + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + +/// Helper structure for templated load iteration (inductive case) +template +struct IterateThreadLoad +{ + template + static __device__ __forceinline__ void Load(T const *ptr, T *vals) + { + vals[COUNT] = ThreadLoad(ptr + COUNT); + IterateThreadLoad::template Load(ptr, vals); + } + + template + static __device__ __forceinline__ void Dereference(InputIteratorT itr, T *vals) + { + vals[COUNT] = itr[COUNT]; + IterateThreadLoad::Dereference(itr, vals); + } +}; + + +/// Helper structure for templated load iteration (termination case) +template +struct IterateThreadLoad +{ + template + static __device__ __forceinline__ void Load(T const * /*ptr*/, T * /*vals*/) {} + + template + static __device__ __forceinline__ void Dereference(InputIteratorT /*itr*/, T * /*vals*/) {} +}; + + +/** + * Define a uint4 (16B) ThreadLoad specialization for the given Cache load modifier + */ +#define _CUB_LOAD_16(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ uint4 ThreadLoad(uint4 const *ptr) \ + { \ + uint4 retval; \ + asm volatile ("ld."#ptx_modifier".v4.u32 {%0, %1, %2, %3}, [%4];" : \ + "=r"(retval.x), \ + "=r"(retval.y), \ + "=r"(retval.z), \ + "=r"(retval.w) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } \ + template<> \ + __device__ __forceinline__ ulonglong2 ThreadLoad(ulonglong2 const *ptr) \ + { \ + ulonglong2 retval; \ + asm volatile ("ld."#ptx_modifier".v2.u64 {%0, %1}, [%2];" : \ + "=l"(retval.x), \ + "=l"(retval.y) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } + +/** + * Define a uint2 (8B) ThreadLoad specialization for the given Cache load modifier + */ +#define _CUB_LOAD_8(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ ushort4 ThreadLoad(ushort4 const *ptr) \ + { \ + ushort4 retval; \ + asm volatile ("ld."#ptx_modifier".v4.u16 {%0, %1, %2, %3}, [%4];" : \ + "=h"(retval.x), \ + "=h"(retval.y), \ + "=h"(retval.z), \ + "=h"(retval.w) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } \ + template<> \ + __device__ __forceinline__ uint2 ThreadLoad(uint2 const *ptr) \ + { \ + uint2 retval; \ + asm volatile ("ld."#ptx_modifier".v2.u32 {%0, %1}, [%2];" : \ + "=r"(retval.x), \ + "=r"(retval.y) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } \ + template<> \ + __device__ __forceinline__ unsigned long long ThreadLoad(unsigned long long const *ptr) \ + { \ + unsigned long long retval; \ + asm volatile ("ld."#ptx_modifier".u64 %0, [%1];" : \ + "=l"(retval) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } + +/** + * Define a uint (4B) ThreadLoad 
specialization for the given Cache load modifier + */ +#define _CUB_LOAD_4(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ unsigned int ThreadLoad(unsigned int const *ptr) \ + { \ + unsigned int retval; \ + asm volatile ("ld."#ptx_modifier".u32 %0, [%1];" : \ + "=r"(retval) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } + + +/** + * Define a unsigned short (2B) ThreadLoad specialization for the given Cache load modifier + */ +#define _CUB_LOAD_2(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ unsigned short ThreadLoad(unsigned short const *ptr) \ + { \ + unsigned short retval; \ + asm volatile ("ld."#ptx_modifier".u16 %0, [%1];" : \ + "=h"(retval) : \ + _CUB_ASM_PTR_(ptr)); \ + return retval; \ + } + + +/** + * Define an unsigned char (1B) ThreadLoad specialization for the given Cache load modifier + */ +#define _CUB_LOAD_1(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ unsigned char ThreadLoad(unsigned char const *ptr) \ + { \ + unsigned short retval; \ + asm volatile ( \ + "{" \ + " .reg .u8 datum;" \ + " ld."#ptx_modifier".u8 datum, [%1];" \ + " cvt.u16.u8 %0, datum;" \ + "}" : \ + "=h"(retval) : \ + _CUB_ASM_PTR_(ptr)); \ + return (unsigned char) retval; \ + } + + +/** + * Define powers-of-two ThreadLoad specializations for the given Cache load modifier + */ +#define _CUB_LOAD_ALL(cub_modifier, ptx_modifier) \ + _CUB_LOAD_16(cub_modifier, ptx_modifier) \ + _CUB_LOAD_8(cub_modifier, ptx_modifier) \ + _CUB_LOAD_4(cub_modifier, ptx_modifier) \ + _CUB_LOAD_2(cub_modifier, ptx_modifier) \ + _CUB_LOAD_1(cub_modifier, ptx_modifier) \ + + +/** + * Define powers-of-two ThreadLoad specializations for the various Cache load modifiers + */ +#if CUB_PTX_ARCH >= 200 + _CUB_LOAD_ALL(LOAD_CA, ca) + _CUB_LOAD_ALL(LOAD_CG, cg) + _CUB_LOAD_ALL(LOAD_CS, cs) + _CUB_LOAD_ALL(LOAD_CV, cv) +#else + _CUB_LOAD_ALL(LOAD_CA, global) + // Use volatile to ensure coherent reads when this PTX is JIT'd to run on newer architectures with L1 + _CUB_LOAD_ALL(LOAD_CG, volatile.global) + _CUB_LOAD_ALL(LOAD_CS, global) + _CUB_LOAD_ALL(LOAD_CV, volatile.global) +#endif + +#if CUB_PTX_ARCH >= 350 + _CUB_LOAD_ALL(LOAD_LDG, global.nc) +#else + _CUB_LOAD_ALL(LOAD_LDG, global) +#endif + + +// Macro cleanup +#undef _CUB_LOAD_ALL +#undef _CUB_LOAD_1 +#undef _CUB_LOAD_2 +#undef _CUB_LOAD_4 +#undef _CUB_LOAD_8 +#undef _CUB_LOAD_16 + + + +/** + * ThreadLoad definition for LOAD_DEFAULT modifier on iterator types + */ +template +__device__ __forceinline__ typename std::iterator_traits::value_type ThreadLoad( + InputIteratorT itr, + Int2Type /*modifier*/, + Int2Type /*is_pointer*/) +{ + return *itr; +} + + +/** + * ThreadLoad definition for LOAD_DEFAULT modifier on pointer types + */ +template +__device__ __forceinline__ T ThreadLoad( + T *ptr, + Int2Type /*modifier*/, + Int2Type /*is_pointer*/) +{ + return *ptr; +} + + +/** + * ThreadLoad definition for LOAD_VOLATILE modifier on primitive pointer types + */ +template +__device__ __forceinline__ T ThreadLoadVolatilePointer( + T *ptr, + Int2Type /*is_primitive*/) +{ + T retval = *reinterpret_cast(ptr); + return retval; +} + + +/** + * ThreadLoad definition for LOAD_VOLATILE modifier on non-primitive pointer types + */ +template +__device__ __forceinline__ T ThreadLoadVolatilePointer( + T *ptr, + Int2Type /*is_primitive*/) +{ + typedef typename UnitWord::VolatileWord VolatileWord; // Word type for memcopying + + const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord); +/* + VolatileWord 
words[VOLATILE_MULTIPLE]; + + IterateThreadLoad<0, VOLATILE_MULTIPLE>::Dereference( + reinterpret_cast(ptr), + words); + + return *reinterpret_cast(words); +*/ + + T retval; + VolatileWord *words = reinterpret_cast(&retval); + IterateThreadLoad<0, VOLATILE_MULTIPLE>::Dereference( + reinterpret_cast(ptr), + words); + return retval; +} + + +/** + * ThreadLoad definition for LOAD_VOLATILE modifier on pointer types + */ +template +__device__ __forceinline__ T ThreadLoad( + T *ptr, + Int2Type /*modifier*/, + Int2Type /*is_pointer*/) +{ + // Apply tags for partial-specialization + return ThreadLoadVolatilePointer(ptr, Int2Type::PRIMITIVE>()); +} + + +/** + * ThreadLoad definition for generic modifiers on pointer types + */ +template +__device__ __forceinline__ T ThreadLoad( + T const *ptr, + Int2Type /*modifier*/, + Int2Type /*is_pointer*/) +{ + typedef typename UnitWord::DeviceWord DeviceWord; + + const int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord); + + DeviceWord words[DEVICE_MULTIPLE]; + + IterateThreadLoad<0, DEVICE_MULTIPLE>::template Load( + reinterpret_cast(const_cast(ptr)), + words); + + return *reinterpret_cast(words); +} + + +/** + * ThreadLoad definition for generic modifiers + */ +template < + CacheLoadModifier MODIFIER, + typename InputIteratorT> +__device__ __forceinline__ typename std::iterator_traits::value_type ThreadLoad(InputIteratorT itr) +{ + // Apply tags for partial-specialization + return ThreadLoad( + itr, + Int2Type(), + Int2Type::VALUE>()); +} + + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/** @} */ // end group UtilIo + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/thread/thread_operators.cuh b/GraphBLAS/CUDA/local_cub/thread/thread_operators.cuh new file mode 100644 index 0000000000..76cd800f58 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/thread/thread_operators.cuh @@ -0,0 +1,317 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Simple binary operator functor types + */ + +/****************************************************************************** + * Simple functor operators + ******************************************************************************/ + +#pragma once + +#include "../util_macro.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilModule + * @{ + */ + +/** + * \brief Default equality functor + */ +struct Equality +{ + /// Boolean equality operator, returns (a == b) + template + __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const + { + return a == b; + } +}; + + +/** + * \brief Default inequality functor + */ +struct Inequality +{ + /// Boolean inequality operator, returns (a != b) + template + __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) const + { + return a != b; + } +}; + + +/** + * \brief Inequality functor (wraps equality functor) + */ +template +struct InequalityWrapper +{ + /// Wrapped equality operator + EqualityOp op; + + /// Constructor + __host__ __device__ __forceinline__ + InequalityWrapper(EqualityOp op) : op(op) {} + + /// Boolean inequality operator, returns (a != b) + template + __host__ __device__ __forceinline__ bool operator()(const T &a, const T &b) + { + return !op(a, b); + } +}; + + +/** + * \brief Default sum functor + */ +struct Sum +{ + /// Boolean sum operator, returns a + b + template + __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const + { + return a + b; + } +}; + + +/** + * \brief Default max functor + */ +struct Max +{ + /// Boolean max operator, returns (a > b) ? a : b + template + __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const + { + return CUB_MAX(a, b); + } +}; + + +/** + * \brief Arg max functor (keeps the value and offset of the first occurrence of the larger item) + */ +struct ArgMax +{ + /// Boolean max operator, preferring the item having the smaller offset in case of ties + template + __host__ __device__ __forceinline__ KeyValuePair operator()( + const KeyValuePair &a, + const KeyValuePair &b) const + { +// Mooch BUG (device reduce argmax gk110 3.2 million random fp32) +// return ((b.value > a.value) || ((a.value == b.value) && (b.key < a.key))) ? b : a; + + if ((b.value > a.value) || ((a.value == b.value) && (b.key < a.key))) + return b; + return a; + } +}; + + +/** + * \brief Default min functor + */ +struct Min +{ + /// Boolean min operator, returns (a < b) ? 
a : b + template + __host__ __device__ __forceinline__ T operator()(const T &a, const T &b) const + { + return CUB_MIN(a, b); + } +}; + + +/** + * \brief Arg min functor (keeps the value and offset of the first occurrence of the smallest item) + */ +struct ArgMin +{ + /// Boolean min operator, preferring the item having the smaller offset in case of ties + template + __host__ __device__ __forceinline__ KeyValuePair operator()( + const KeyValuePair &a, + const KeyValuePair &b) const + { +// Mooch BUG (device reduce argmax gk110 3.2 million random fp32) +// return ((b.value < a.value) || ((a.value == b.value) && (b.key < a.key))) ? b : a; + + if ((b.value < a.value) || ((a.value == b.value) && (b.key < a.key))) + return b; + return a; + } +}; + + +/** + * \brief Default cast functor + */ +template +struct CastOp +{ + /// Cast operator, returns (B) a + template + __host__ __device__ __forceinline__ B operator()(const A &a) const + { + return (B) a; + } +}; + + +/** + * \brief Binary operator wrapper for switching non-commutative scan arguments + */ +template +class SwizzleScanOp +{ +private: + + /// Wrapped scan operator + ScanOp scan_op; + +public: + + /// Constructor + __host__ __device__ __forceinline__ + SwizzleScanOp(ScanOp scan_op) : scan_op(scan_op) {} + + /// Switch the scan arguments + template + __host__ __device__ __forceinline__ + T operator()(const T &a, const T &b) + { + T _a(a); + T _b(b); + + return scan_op(_b, _a); + } +}; + + +/** + * \brief Reduce-by-segment functor. + * + * Given two cub::KeyValuePair inputs \p a and \p b and a + * binary associative combining operator \p f(const T &x, const T &y), + * an instance of this functor returns a cub::KeyValuePair whose \p key + * field is a.key + b.key, and whose \p value field + * is either b.value if b.key is non-zero, or f(a.value, b.value) otherwise. + * + * ReduceBySegmentOp is an associative, non-commutative binary combining operator + * for input sequences of cub::KeyValuePair pairings. Such + * sequences are typically used to represent a segmented set of values to be reduced + * and a corresponding set of {0,1}-valued integer "head flags" demarcating the + * first value of each segment. + * + */ +template ///< Binary reduction operator to apply to values +struct ReduceBySegmentOp +{ + /// Wrapped reduction operator + ReductionOpT op; + + /// Constructor + __host__ __device__ __forceinline__ ReduceBySegmentOp() {} + + /// Constructor + __host__ __device__ __forceinline__ ReduceBySegmentOp(ReductionOpT op) : op(op) {} + + /// Scan operator + template ///< KeyValuePair pairing of T (value) and OffsetT (head flag) + __host__ __device__ __forceinline__ KeyValuePairT operator()( + const KeyValuePairT &first, ///< First partial reduction + const KeyValuePairT &second) ///< Second partial reduction + { + KeyValuePairT retval; + retval.key = first.key + second.key; + retval.value = (second.key) ? 
+ second.value : // The second partial reduction spans a segment reset, so it's value aggregate becomes the running aggregate + op(first.value, second.value); // The second partial reduction does not span a reset, so accumulate both into the running aggregate + return retval; + } +}; + + + +template ///< Binary reduction operator to apply to values +struct ReduceByKeyOp +{ + /// Wrapped reduction operator + ReductionOpT op; + + /// Constructor + __host__ __device__ __forceinline__ ReduceByKeyOp() {} + + /// Constructor + __host__ __device__ __forceinline__ ReduceByKeyOp(ReductionOpT op) : op(op) {} + + /// Scan operator + template + __host__ __device__ __forceinline__ KeyValuePairT operator()( + const KeyValuePairT &first, ///< First partial reduction + const KeyValuePairT &second) ///< Second partial reduction + { + KeyValuePairT retval = second; + + if (first.key == second.key) + retval.value = op(first.value, retval.value); + + return retval; + } +}; + + + + + + + +/** @} */ // end group UtilModule + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/thread/thread_reduce.cuh b/GraphBLAS/CUDA/local_cub/thread/thread_reduce.cuh new file mode 100644 index 0000000000..4c13688f33 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/thread/thread_reduce.cuh @@ -0,0 +1,152 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
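[Editor's note: an illustrative aside on ReduceBySegmentOp defined above, not part of the vendored file. It shows how one application of the operator combines two (head-flag, value) partial reductions; the numeric values are made up, and the snippet can run in any __host__ or __device__ function.]

    cub::KeyValuePair<int, float> a, b;
    a.key = 0; a.value = 5.0f;      // partial reduction that does not cross a segment boundary
    b.key = 1; b.value = 2.0f;      // partial reduction that starts a new segment (head flag = 1)

    cub::ReduceBySegmentOp<cub::Sum> op;
    cub::KeyValuePair<int, float> r = op(a, b);
    // r.key == 1   : head flags accumulate
    // r.value == 2 : b spans a segment reset, so a's running aggregate is dropped
    //                rather than summed into b's value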
+ * + ******************************************************************************/ + +/** + * \file + * Thread utilities for sequential reduction over statically-sized array types + */ + +#pragma once + +#include "../thread/thread_operators.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/// Internal namespace (to prevent ADL mishaps between static functions when mixing different CUB installations) +namespace internal { + +/** + * Sequential reduction over statically-sized array types + */ +template < + int LENGTH, + typename T, + typename ReductionOp> +__device__ __forceinline__ T ThreadReduce( + T* input, ///< [in] Input array + ReductionOp reduction_op, ///< [in] Binary reduction operator + T prefix, ///< [in] Prefix to seed reduction with + Int2Type /*length*/) +{ + T retval = prefix; + + #pragma unroll + for (int i = 0; i < LENGTH; ++i) + retval = reduction_op(retval, input[i]); + + return retval; +} + + +/** + * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. + * + * \tparam LENGTH LengthT of input array + * \tparam T [inferred] The data type to be reduced. + * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ReductionOp> +__device__ __forceinline__ T ThreadReduce( + T* input, ///< [in] Input array + ReductionOp reduction_op, ///< [in] Binary reduction operator + T prefix) ///< [in] Prefix to seed reduction with +{ + return ThreadReduce(input, reduction_op, prefix, Int2Type()); +} + + +/** + * \brief Perform a sequential reduction over \p LENGTH elements of the \p input array. The aggregate is returned. + * + * \tparam LENGTH LengthT of input array + * \tparam T [inferred] The data type to be reduced. + * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ReductionOp> +__device__ __forceinline__ T ThreadReduce( + T* input, ///< [in] Input array + ReductionOp reduction_op) ///< [in] Binary reduction operator +{ + T prefix = input[0]; + return ThreadReduce(input + 1, reduction_op, prefix); +} + + +/** + * \brief Perform a sequential reduction over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. + * + * \tparam LENGTH [inferred] LengthT of \p input array + * \tparam T [inferred] The data type to be reduced. + * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ReductionOp> +__device__ __forceinline__ T ThreadReduce( + T (&input)[LENGTH], ///< [in] Input array + ReductionOp reduction_op, ///< [in] Binary reduction operator + T prefix) ///< [in] Prefix to seed reduction with +{ + return ThreadReduce(input, reduction_op, prefix, Int2Type()); +} + + +/** + * \brief Serial reduction with the specified operator + * + * \tparam LENGTH [inferred] LengthT of \p input array + * \tparam T [inferred] The data type to be reduced. 
+ * \tparam ScanOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ReductionOp> +__device__ __forceinline__ T ThreadReduce( + T (&input)[LENGTH], ///< [in] Input array + ReductionOp reduction_op) ///< [in] Binary reduction operator +{ + return ThreadReduce((T*) input, reduction_op); +} + + +} // internal namespace +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/thread/thread_scan.cuh b/GraphBLAS/CUDA/local_cub/thread/thread_scan.cuh new file mode 100644 index 0000000000..8d67549ae8 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/thread/thread_scan.cuh @@ -0,0 +1,268 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
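[Editor's note: an illustrative aside on the ThreadReduce overloads above, not part of the vendored file. A thread reduces a statically-sized register array with one call; LENGTH is inferred from the array bound, and cub::Max comes from thread_operators.cuh. The values are made up.]

    __device__ float MaxOfFour()
    {
        float items[4] = {3.0f, 9.0f, 1.0f, 4.0f};          // e.g., already loaded into registers
        return cub::internal::ThreadReduce(items, cub::Max());   // returns 9.0f
    }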
+ * + ******************************************************************************/ + +/** + * \file + * Thread utilities for sequential prefix scan over statically-sized array types + */ + +#pragma once + +#include "../thread/thread_operators.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/// Internal namespace (to prevent ADL mishaps between static functions when mixing different CUB installations) +namespace internal { + + +/** + * \addtogroup UtilModule + * @{ + */ + +/** + * \name Sequential prefix scan over statically-sized array types + * @{ + */ + +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanExclusive( + T inclusive, + T exclusive, + T *input, ///< [in] Input array + T *output, ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + Int2Type /*length*/) +{ + #pragma unroll + for (int i = 0; i < LENGTH; ++i) + { + inclusive = scan_op(exclusive, input[i]); + output[i] = exclusive; + exclusive = inclusive; + } + + return inclusive; +} + + + +/** + * \brief Perform a sequential exclusive prefix scan over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. + * + * \tparam LENGTH LengthT of \p input and \p output arrays + * \tparam T [inferred] The data type to be scanned. + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanExclusive( + T *input, ///< [in] Input array + T *output, ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T prefix, ///< [in] Prefix to seed scan with + bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. If not, the first output element is undefined. (Handy for preventing thread-0 from applying a prefix.) +{ + T inclusive = input[0]; + if (apply_prefix) + { + inclusive = scan_op(prefix, inclusive); + } + output[0] = prefix; + T exclusive = inclusive; + + return ThreadScanExclusive(inclusive, exclusive, input + 1, output + 1, scan_op, Int2Type()); +} + + +/** + * \brief Perform a sequential exclusive prefix scan over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. + * + * \tparam LENGTH [inferred] LengthT of \p input and \p output arrays + * \tparam T [inferred] The data type to be scanned. + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanExclusive( + T (&input)[LENGTH], ///< [in] Input array + T (&output)[LENGTH], ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T prefix, ///< [in] Prefix to seed scan with + bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. (Handy for preventing thread-0 from applying a prefix.) 
+{ + return ThreadScanExclusive((T*) input, (T*) output, scan_op, prefix, apply_prefix); +} + + + + + + + + + +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanInclusive( + T inclusive, + T *input, ///< [in] Input array + T *output, ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + Int2Type /*length*/) +{ + #pragma unroll + for (int i = 0; i < LENGTH; ++i) + { + inclusive = scan_op(inclusive, input[i]); + output[i] = inclusive; + } + + return inclusive; +} + + +/** + * \brief Perform a sequential inclusive prefix scan over \p LENGTH elements of the \p input array. The aggregate is returned. + * + * \tparam LENGTH LengthT of \p input and \p output arrays + * \tparam T [inferred] The data type to be scanned. + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanInclusive( + T *input, ///< [in] Input array + T *output, ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator +{ + T inclusive = input[0]; + output[0] = inclusive; + + // Continue scan + return ThreadScanInclusive(inclusive, input + 1, output + 1, scan_op, Int2Type()); +} + + +/** + * \brief Perform a sequential inclusive prefix scan over the statically-sized \p input array. The aggregate is returned. + * + * \tparam LENGTH [inferred] LengthT of \p input and \p output arrays + * \tparam T [inferred] The data type to be scanned. + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanInclusive( + T (&input)[LENGTH], ///< [in] Input array + T (&output)[LENGTH], ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op) ///< [in] Binary scan operator +{ + return ThreadScanInclusive((T*) input, (T*) output, scan_op); +} + + +/** + * \brief Perform a sequential inclusive prefix scan over \p LENGTH elements of the \p input array, seeded with the specified \p prefix. The aggregate is returned. + * + * \tparam LENGTH LengthT of \p input and \p output arrays + * \tparam T [inferred] The data type to be scanned. + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanInclusive( + T *input, ///< [in] Input array + T *output, ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T prefix, ///< [in] Prefix to seed scan with + bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. (Handy for preventing thread-0 from applying a prefix.) +{ + T inclusive = input[0]; + if (apply_prefix) + { + inclusive = scan_op(prefix, inclusive); + } + output[0] = inclusive; + + // Continue scan + return ThreadScanInclusive(inclusive, input + 1, output + 1, scan_op, Int2Type()); +} + + +/** + * \brief Perform a sequential inclusive prefix scan over the statically-sized \p input array, seeded with the specified \p prefix. The aggregate is returned. + * + * \tparam LENGTH [inferred] LengthT of \p input and \p output arrays + * \tparam T [inferred] The data type to be scanned. 
+ * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ +template < + int LENGTH, + typename T, + typename ScanOp> +__device__ __forceinline__ T ThreadScanInclusive( + T (&input)[LENGTH], ///< [in] Input array + T (&output)[LENGTH], ///< [out] Output array (may be aliased to \p input) + ScanOp scan_op, ///< [in] Binary scan operator + T prefix, ///< [in] Prefix to seed scan with + bool apply_prefix = true) ///< [in] Whether or not the calling thread should apply its prefix. (Handy for preventing thread-0 from applying a prefix.) +{ + return ThreadScanInclusive((T*) input, (T*) output, scan_op, prefix, apply_prefix); +} + + +//@} end member group + +/** @} */ // end group UtilModule + + +} // internal namespace +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/thread/thread_search.cuh b/GraphBLAS/CUDA/local_cub/thread/thread_search.cuh new file mode 100644 index 0000000000..3099080a3c --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/thread/thread_search.cuh @@ -0,0 +1,154 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
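[Editor's note: an illustrative aside on the per-thread scan routines above, not part of the vendored file. It performs an exclusive prefix sum in place over a register array, seeded with a prefix of 0; the values are made up.]

    __device__ int ExclusiveSumOfFour()
    {
        int items[4] = {1, 2, 3, 4};
        // Scan in place; the return value is the inclusive aggregate.
        int aggregate = cub::internal::ThreadScanExclusive(items, items, cub::Sum(), 0);
        // items is now {0, 1, 3, 6}; aggregate == 10
        return aggregate;
    }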
+ * + ******************************************************************************/ + +/** + * \file + * Thread utilities for sequential search + */ + +#pragma once + +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * Computes the begin offsets into A and B for the specific diagonal + */ +template < + typename AIteratorT, + typename BIteratorT, + typename OffsetT, + typename CoordinateT> +__host__ __device__ __forceinline__ void MergePathSearch( + OffsetT diagonal, + AIteratorT a, + BIteratorT b, + OffsetT a_len, + OffsetT b_len, + CoordinateT& path_coordinate) +{ + /// The value type of the input iterator + typedef typename std::iterator_traits::value_type T; + + OffsetT split_min = CUB_MAX(diagonal - b_len, 0); + OffsetT split_max = CUB_MIN(diagonal, a_len); + + while (split_min < split_max) + { + OffsetT split_pivot = (split_min + split_max) >> 1; + if (a[split_pivot] <= b[diagonal - split_pivot - 1]) + { + // Move candidate split range up A, down B + split_min = split_pivot + 1; + } + else + { + // Move candidate split range up B, down A + split_max = split_pivot; + } + } + + path_coordinate.x = CUB_MIN(split_min, a_len); + path_coordinate.y = diagonal - split_min; +} + + + +/** + * \brief Returns the offset of the first value within \p input which does not compare less than \p val + */ +template < + typename InputIteratorT, + typename OffsetT, + typename T> +__device__ __forceinline__ OffsetT LowerBound( + InputIteratorT input, ///< [in] Input sequence + OffsetT num_items, ///< [in] Input sequence length + T val) ///< [in] Search key +{ + OffsetT retval = 0; + while (num_items > 0) + { + OffsetT half = num_items >> 1; + if (input[retval + half] < val) + { + retval = retval + (half + 1); + num_items = num_items - (half + 1); + } + else + { + num_items = half; + } + } + + return retval; +} + + +/** + * \brief Returns the offset of the first value within \p input which compares greater than \p val + */ +template < + typename InputIteratorT, + typename OffsetT, + typename T> +__device__ __forceinline__ OffsetT UpperBound( + InputIteratorT input, ///< [in] Input sequence + OffsetT num_items, ///< [in] Input sequence length + T val) ///< [in] Search key +{ + OffsetT retval = 0; + while (num_items > 0) + { + OffsetT half = num_items >> 1; + if (val < input[retval + half]) + { + num_items = half; + } + else + { + retval = retval + (half + 1); + num_items = num_items - (half + 1); + } + } + + return retval; +} + + + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/thread/thread_store.cuh b/GraphBLAS/CUDA/local_cub/thread/thread_store.cuh new file mode 100644 index 0000000000..ec20b36f40 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/thread/thread_store.cuh @@ -0,0 +1,422 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Thread utilities for writing memory using PTX cache modifiers. + */ + +#pragma once + +#include + +#include "../util_ptx.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup UtilIo + * @{ + */ + + +//----------------------------------------------------------------------------- +// Tags and constants +//----------------------------------------------------------------------------- + +/** + * \brief Enumeration of cache modifiers for memory store operations. + */ +enum CacheStoreModifier +{ + STORE_DEFAULT, ///< Default (no modifier) + STORE_WB, ///< Cache write-back all coherent levels + STORE_CG, ///< Cache at global level + STORE_CS, ///< Cache streaming (likely to be accessed once) + STORE_WT, ///< Cache write-through (to system memory) + STORE_VOLATILE, ///< Volatile shared (any memory space) +}; + + +/** + * \name Thread I/O (cache modified) + * @{ + */ + +/** + * \brief Thread utility for writing memory using cub::CacheStoreModifier cache modifiers. Can be used to store any data type. 
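A hedged sketch of typical ThreadStore usage with explicit cache modifiers, following the template signature declared in this file; the output pointers and values are illustrative:

__global__ void StoreSketch(int *d_int_out, double4 *d_vec_out)
{
    // Cache at global level (STORE_CG):
    cub::ThreadStore<cub::STORE_CG>(d_int_out + threadIdx.x, 1);

    // Cache-streaming hint for data unlikely to be accessed again (STORE_CS):
    double4 v = make_double4(1.0, 2.0, 3.0, 4.0);
    cub::ThreadStore<cub::STORE_CS>(d_vec_out + threadIdx.x, v);

    // Default modifier behaves like a plain assignment through the iterator:
    cub::ThreadStore<cub::STORE_DEFAULT>(d_int_out + threadIdx.x, 2);
}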
+ * + * \par Example + * \code + * #include // or equivalently + * + * // 32-bit store using cache-global modifier: + * int *d_out; + * int val; + * cub::ThreadStore(d_out + threadIdx.x, val); + * + * // 16-bit store using default modifier + * short *d_out; + * short val; + * cub::ThreadStore(d_out + threadIdx.x, val); + * + * // 256-bit store using write-through modifier + * double4 *d_out; + * double4 val; + * cub::ThreadStore(d_out + threadIdx.x, val); + * + * // 96-bit store using cache-streaming cache modifier + * struct TestFoo { bool a; short b; }; + * TestFoo *d_struct; + * TestFoo val; + * cub::ThreadStore(d_out + threadIdx.x, val); + * \endcode + * + * \tparam MODIFIER [inferred] CacheStoreModifier enumeration + * \tparam InputIteratorT [inferred] Output iterator type \iterator + * \tparam T [inferred] Data type of output value + */ +template < + CacheStoreModifier MODIFIER, + typename OutputIteratorT, + typename T> +__device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val); + + +//@} end member group + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + +/// Helper structure for templated store iteration (inductive case) +template +struct IterateThreadStore +{ + template + static __device__ __forceinline__ void Store(T *ptr, T *vals) + { + ThreadStore(ptr + COUNT, vals[COUNT]); + IterateThreadStore::template Store(ptr, vals); + } + + template + static __device__ __forceinline__ void Dereference(OutputIteratorT ptr, T *vals) + { + ptr[COUNT] = vals[COUNT]; + IterateThreadStore::Dereference(ptr, vals); + } + +}; + +/// Helper structure for templated store iteration (termination case) +template +struct IterateThreadStore +{ + template + static __device__ __forceinline__ void Store(T * /*ptr*/, T * /*vals*/) {} + + template + static __device__ __forceinline__ void Dereference(OutputIteratorT /*ptr*/, T * /*vals*/) {} +}; + + +/** + * Define a uint4 (16B) ThreadStore specialization for the given Cache load modifier + */ +#define _CUB_STORE_16(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ void ThreadStore(uint4* ptr, uint4 val) \ + { \ + asm volatile ("st."#ptx_modifier".v4.u32 [%0], {%1, %2, %3, %4};" : : \ + _CUB_ASM_PTR_(ptr), \ + "r"(val.x), \ + "r"(val.y), \ + "r"(val.z), \ + "r"(val.w)); \ + } \ + template<> \ + __device__ __forceinline__ void ThreadStore(ulonglong2* ptr, ulonglong2 val) \ + { \ + asm volatile ("st."#ptx_modifier".v2.u64 [%0], {%1, %2};" : : \ + _CUB_ASM_PTR_(ptr), \ + "l"(val.x), \ + "l"(val.y)); \ + } + + +/** + * Define a uint2 (8B) ThreadStore specialization for the given Cache load modifier + */ +#define _CUB_STORE_8(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ void ThreadStore(ushort4* ptr, ushort4 val) \ + { \ + asm volatile ("st."#ptx_modifier".v4.u16 [%0], {%1, %2, %3, %4};" : : \ + _CUB_ASM_PTR_(ptr), \ + "h"(val.x), \ + "h"(val.y), \ + "h"(val.z), \ + "h"(val.w)); \ + } \ + template<> \ + __device__ __forceinline__ void ThreadStore(uint2* ptr, uint2 val) \ + { \ + asm volatile ("st."#ptx_modifier".v2.u32 [%0], {%1, %2};" : : \ + _CUB_ASM_PTR_(ptr), \ + "r"(val.x), \ + "r"(val.y)); \ + } \ + template<> \ + __device__ __forceinline__ void ThreadStore(unsigned long long* ptr, unsigned long long val) \ + { \ + asm volatile ("st."#ptx_modifier".u64 [%0], %1;" : : \ + _CUB_ASM_PTR_(ptr), \ + "l"(val)); \ + } + +/** + * Define a unsigned int (4B) ThreadStore specialization for the given Cache load modifier + */ +#define _CUB_STORE_4(cub_modifier, ptx_modifier) \ + template<> \ + 
__device__ __forceinline__ void ThreadStore(unsigned int* ptr, unsigned int val) \ + { \ + asm volatile ("st."#ptx_modifier".u32 [%0], %1;" : : \ + _CUB_ASM_PTR_(ptr), \ + "r"(val)); \ + } + + +/** + * Define a unsigned short (2B) ThreadStore specialization for the given Cache load modifier + */ +#define _CUB_STORE_2(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ void ThreadStore(unsigned short* ptr, unsigned short val) \ + { \ + asm volatile ("st."#ptx_modifier".u16 [%0], %1;" : : \ + _CUB_ASM_PTR_(ptr), \ + "h"(val)); \ + } + + +/** + * Define a unsigned char (1B) ThreadStore specialization for the given Cache load modifier + */ +#define _CUB_STORE_1(cub_modifier, ptx_modifier) \ + template<> \ + __device__ __forceinline__ void ThreadStore(unsigned char* ptr, unsigned char val) \ + { \ + asm volatile ( \ + "{" \ + " .reg .u8 datum;" \ + " cvt.u8.u16 datum, %1;" \ + " st."#ptx_modifier".u8 [%0], datum;" \ + "}" : : \ + _CUB_ASM_PTR_(ptr), \ + "h"((unsigned short) val)); \ + } + +/** + * Define powers-of-two ThreadStore specializations for the given Cache load modifier + */ +#define _CUB_STORE_ALL(cub_modifier, ptx_modifier) \ + _CUB_STORE_16(cub_modifier, ptx_modifier) \ + _CUB_STORE_8(cub_modifier, ptx_modifier) \ + _CUB_STORE_4(cub_modifier, ptx_modifier) \ + _CUB_STORE_2(cub_modifier, ptx_modifier) \ + _CUB_STORE_1(cub_modifier, ptx_modifier) \ + + +/** + * Define ThreadStore specializations for the various Cache load modifiers + */ +#if CUB_PTX_ARCH >= 200 + _CUB_STORE_ALL(STORE_WB, wb) + _CUB_STORE_ALL(STORE_CG, cg) + _CUB_STORE_ALL(STORE_CS, cs) + _CUB_STORE_ALL(STORE_WT, wt) +#else + _CUB_STORE_ALL(STORE_WB, global) + _CUB_STORE_ALL(STORE_CG, global) + _CUB_STORE_ALL(STORE_CS, global) + _CUB_STORE_ALL(STORE_WT, volatile.global) +#endif + + +// Macro cleanup +#undef _CUB_STORE_ALL +#undef _CUB_STORE_1 +#undef _CUB_STORE_2 +#undef _CUB_STORE_4 +#undef _CUB_STORE_8 +#undef _CUB_STORE_16 + + +/** + * ThreadStore definition for STORE_DEFAULT modifier on iterator types + */ +template +__device__ __forceinline__ void ThreadStore( + OutputIteratorT itr, + T val, + Int2Type /*modifier*/, + Int2Type /*is_pointer*/) +{ + *itr = val; +} + + +/** + * ThreadStore definition for STORE_DEFAULT modifier on pointer types + */ +template +__device__ __forceinline__ void ThreadStore( + T *ptr, + T val, + Int2Type /*modifier*/, + Int2Type /*is_pointer*/) +{ + *ptr = val; +} + + +/** + * ThreadStore definition for STORE_VOLATILE modifier on primitive pointer types + */ +template +__device__ __forceinline__ void ThreadStoreVolatilePtr( + T *ptr, + T val, + Int2Type /*is_primitive*/) +{ + *reinterpret_cast(ptr) = val; +} + + +/** + * ThreadStore definition for STORE_VOLATILE modifier on non-primitive pointer types + */ +template +__device__ __forceinline__ void ThreadStoreVolatilePtr( + T *ptr, + T val, + Int2Type /*is_primitive*/) +{ + // Create a temporary using shuffle-words, then store using volatile-words + typedef typename UnitWord::VolatileWord VolatileWord; + typedef typename UnitWord::ShuffleWord ShuffleWord; + + const int VOLATILE_MULTIPLE = sizeof(T) / sizeof(VolatileWord); + const int SHUFFLE_MULTIPLE = sizeof(T) / sizeof(ShuffleWord); + + VolatileWord words[VOLATILE_MULTIPLE]; + + #pragma unroll + for (int i = 0; i < SHUFFLE_MULTIPLE; ++i) + reinterpret_cast(words)[i] = reinterpret_cast(&val)[i]; + + IterateThreadStore<0, VOLATILE_MULTIPLE>::template Dereference( + reinterpret_cast(ptr), + words); +} + + +/** + * ThreadStore definition for STORE_VOLATILE modifier on 
pointer types + */ +template +__device__ __forceinline__ void ThreadStore( + T *ptr, + T val, + Int2Type /*modifier*/, + Int2Type /*is_pointer*/) +{ + ThreadStoreVolatilePtr(ptr, val, Int2Type::PRIMITIVE>()); +} + + +/** + * ThreadStore definition for generic modifiers on pointer types + */ +template +__device__ __forceinline__ void ThreadStore( + T *ptr, + T val, + Int2Type /*modifier*/, + Int2Type /*is_pointer*/) +{ + // Create a temporary using shuffle-words, then store using device-words + typedef typename UnitWord::DeviceWord DeviceWord; + typedef typename UnitWord::ShuffleWord ShuffleWord; + + const int DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord); + const int SHUFFLE_MULTIPLE = sizeof(T) / sizeof(ShuffleWord); + + DeviceWord words[DEVICE_MULTIPLE]; + + #pragma unroll + for (int i = 0; i < SHUFFLE_MULTIPLE; ++i) + reinterpret_cast(words)[i] = reinterpret_cast(&val)[i]; + + IterateThreadStore<0, DEVICE_MULTIPLE>::template Store( + reinterpret_cast(ptr), + words); +} + + +/** + * ThreadStore definition for generic modifiers + */ +template +__device__ __forceinline__ void ThreadStore(OutputIteratorT itr, T val) +{ + ThreadStore( + itr, + val, + Int2Type(), + Int2Type::VALUE>()); +} + + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/** @} */ // end group UtilIo + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/util_allocator.cuh b/GraphBLAS/CUDA/local_cub/util_allocator.cuh new file mode 100644 index 0000000000..0e6dd0486e --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/util_allocator.cuh @@ -0,0 +1,708 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/****************************************************************************** + * Simple caching allocator for device memory allocations. 
The allocator is + * thread-safe and capable of managing device allocations on multiple devices. + ******************************************************************************/ + +#pragma once + +#include "util_namespace.cuh" +#include "util_debug.cuh" + +#include +#include + +#include "host/mutex.cuh" +#include + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilMgmt + * @{ + */ + + +/****************************************************************************** + * CachingDeviceAllocator (host use) + ******************************************************************************/ + +/** + * \brief A simple caching allocator for device memory allocations. + * + * \par Overview + * The allocator is thread-safe and stream-safe and is capable of managing cached + * device allocations on multiple devices. It behaves as follows: + * + * \par + * - Allocations from the allocator are associated with an \p active_stream. Once freed, + * the allocation becomes available immediately for reuse within the \p active_stream + * with which it was associated with during allocation, and it becomes available for + * reuse within other streams when all prior work submitted to \p active_stream has completed. + * - Allocations are categorized and cached by bin size. A new allocation request of + * a given size will only consider cached allocations within the corresponding bin. + * - Bin limits progress geometrically in accordance with the growth factor + * \p bin_growth provided during construction. Unused device allocations within + * a larger bin cache are not reused for allocation requests that categorize to + * smaller bin sizes. + * - Allocation requests below (\p bin_growth ^ \p min_bin) are rounded up to + * (\p bin_growth ^ \p min_bin). + * - Allocations above (\p bin_growth ^ \p max_bin) are not rounded up to the nearest + * bin and are simply freed when they are deallocated instead of being returned + * to a bin-cache. + * - %If the total storage of cached allocations on a given device will exceed + * \p max_cached_bytes, allocations for that device are simply freed when they are + * deallocated instead of being returned to their bin-cache. 
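A hedged illustration of how the binning rules above play out under the default configuration (bin_growth = 8, min_bin = 3, max_bin = 7):

// request of        100 bytes -> rounded up to    512 bytes (8^3, the minimum bin)
// request of      3,000 bytes -> rounded up to  4,096 bytes (8^4)
// request of    500,000 bytes -> rounded up to   2 MB       (8^7, the maximum bin)
// request of 10,000,000 bytes -> allocated exactly and freed on DeviceFree,
//                                since it exceeds 8^7 and is never cached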
+ * + * \par + * For example, the default-constructed CachingDeviceAllocator is configured with: + * - \p bin_growth = 8 + * - \p min_bin = 3 + * - \p max_bin = 7 + * - \p max_cached_bytes = 6MB - 1B + * + * \par + * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB + * and sets a maximum of 6,291,455 cached bytes per device + * + */ +struct CachingDeviceAllocator +{ + + //--------------------------------------------------------------------- + // Constants + //--------------------------------------------------------------------- + + /// Out-of-bounds bin + static const unsigned int INVALID_BIN = (unsigned int) -1; + + /// Invalid size + static const size_t INVALID_SIZE = (size_t) -1; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + /// Invalid device ordinal + static const int INVALID_DEVICE_ORDINAL = -1; + + //--------------------------------------------------------------------- + // Type definitions and helper types + //--------------------------------------------------------------------- + + /** + * Descriptor for device memory allocations + */ + struct BlockDescriptor + { + void* d_ptr; // Device pointer + size_t bytes; // Size of allocation in bytes + unsigned int bin; // Bin enumeration + int device; // device ordinal + cudaStream_t associated_stream; // Associated associated_stream + cudaEvent_t ready_event; // Signal when associated stream has run to the point at which this block was freed + + // Constructor (suitable for searching maps for a specific block, given its pointer and device) + BlockDescriptor(void *d_ptr, int device) : + d_ptr(d_ptr), + bytes(0), + bin(INVALID_BIN), + device(device), + associated_stream(0), + ready_event(0) + {} + + // Constructor (suitable for searching maps for a range of suitable blocks, given a device) + BlockDescriptor(int device) : + d_ptr(NULL), + bytes(0), + bin(INVALID_BIN), + device(device), + associated_stream(0), + ready_event(0) + {} + + // Comparison functor for comparing device pointers + static bool PtrCompare(const BlockDescriptor &a, const BlockDescriptor &b) + { + if (a.device == b.device) + return (a.d_ptr < b.d_ptr); + else + return (a.device < b.device); + } + + // Comparison functor for comparing allocation sizes + static bool SizeCompare(const BlockDescriptor &a, const BlockDescriptor &b) + { + if (a.device == b.device) + return (a.bytes < b.bytes); + else + return (a.device < b.device); + } + }; + + /// BlockDescriptor comparator function interface + typedef bool (*Compare)(const BlockDescriptor &, const BlockDescriptor &); + + class TotalBytes { + public: + size_t free; + size_t live; + TotalBytes() { free = live = 0; } + }; + + /// Set type for cached blocks (ordered by size) + typedef std::multiset CachedBlocks; + + /// Set type for live blocks (ordered by ptr) + typedef std::multiset BusyBlocks; + + /// Map type of device ordinals to the number of cached bytes cached by each device + typedef std::map GpuCachedBytes; + + + //--------------------------------------------------------------------- + // Utility functions + //--------------------------------------------------------------------- + + /** + * Integer pow function for unsigned base and exponent + */ + static unsigned int IntPow( + unsigned int base, + unsigned int exp) + { + unsigned int retval = 1; + while (exp > 0) + { + if (exp & 1) { + retval = retval * base; // multiply the result by the current base + } + base = base * base; // square the base + exp = exp >> 1; // divide the exponent in half + } + return retval; + } + + + /** + * Round 
up to the nearest power-of + */ + void NearestPowerOf( + unsigned int &power, + size_t &rounded_bytes, + unsigned int base, + size_t value) + { + power = 0; + rounded_bytes = 1; + + if (value * base < value) + { + // Overflow + power = sizeof(size_t) * 8; + rounded_bytes = size_t(0) - 1; + return; + } + + while (rounded_bytes < value) + { + rounded_bytes *= base; + power++; + } + } + + + //--------------------------------------------------------------------- + // Fields + //--------------------------------------------------------------------- + + cub::Mutex mutex; /// Mutex for thread-safety + + unsigned int bin_growth; /// Geometric growth factor for bin-sizes + unsigned int min_bin; /// Minimum bin enumeration + unsigned int max_bin; /// Maximum bin enumeration + + size_t min_bin_bytes; /// Minimum bin size + size_t max_bin_bytes; /// Maximum bin size + size_t max_cached_bytes; /// Maximum aggregate cached bytes per device + + const bool skip_cleanup; /// Whether or not to skip a call to FreeAllCached() when destructor is called. (The CUDA runtime may have already shut down for statically declared allocators) + bool debug; /// Whether or not to print (de)allocation events to stdout + + GpuCachedBytes cached_bytes; /// Map of device ordinal to aggregate cached bytes on that device + CachedBlocks cached_blocks; /// Set of cached device allocations available for reuse + BusyBlocks live_blocks; /// Set of live device allocations currently in use + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + //--------------------------------------------------------------------- + // Methods + //--------------------------------------------------------------------- + + /** + * \brief Constructor. + */ + CachingDeviceAllocator( + unsigned int bin_growth, ///< Geometric growth factor for bin-sizes + unsigned int min_bin = 1, ///< Minimum bin (default is bin_growth ^ 1) + unsigned int max_bin = INVALID_BIN, ///< Maximum bin (default is no max bin) + size_t max_cached_bytes = INVALID_SIZE, ///< Maximum aggregate cached bytes per device (default is no limit) + bool skip_cleanup = false, ///< Whether or not to skip a call to \p FreeAllCached() when the destructor is called (default is to deallocate) + bool debug = false) ///< Whether or not to print (de)allocation events to stdout (default is no stderr output) + : + bin_growth(bin_growth), + min_bin(min_bin), + max_bin(max_bin), + min_bin_bytes(IntPow(bin_growth, min_bin)), + max_bin_bytes(IntPow(bin_growth, max_bin)), + max_cached_bytes(max_cached_bytes), + skip_cleanup(skip_cleanup), + debug(debug), + cached_blocks(BlockDescriptor::SizeCompare), + live_blocks(BlockDescriptor::PtrCompare) + {} + + + /** + * \brief Default constructor. + * + * Configured with: + * \par + * - \p bin_growth = 8 + * - \p min_bin = 3 + * - \p max_bin = 7 + * - \p max_cached_bytes = (\p bin_growth ^ \p max_bin) * 3) - 1 = 6,291,455 bytes + * + * which delineates five bin-sizes: 512B, 4KB, 32KB, 256KB, and 2MB and + * sets a maximum of 6,291,455 cached bytes per device + */ + CachingDeviceAllocator( + bool skip_cleanup = false, + bool debug = false) + : + bin_growth(8), + min_bin(3), + max_bin(7), + min_bin_bytes(IntPow(bin_growth, min_bin)), + max_bin_bytes(IntPow(bin_growth, max_bin)), + max_cached_bytes((max_bin_bytes * 3) - 1), + skip_cleanup(skip_cleanup), + debug(debug), + cached_blocks(BlockDescriptor::SizeCompare), + live_blocks(BlockDescriptor::PtrCompare) + {} + + + /** + * \brief Sets the limit on the number bytes this allocator is allowed to cache per device. 
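To make the rounding concrete, a small host-side sketch that calls the NearestPowerOf helper above directly (it is a public member of the struct); the wrapper function is illustrative:

void BinRoundingSketch()
{
    cub::CachingDeviceAllocator alloc;               // default bin_growth = 8
    unsigned int bin;
    size_t bin_bytes;
    alloc.NearestPowerOf(bin, bin_bytes, 8, 1000);   // bin = 4, bin_bytes =  4,096 (8^4)
    alloc.NearestPowerOf(bin, bin_bytes, 8, 20000);  // bin = 5, bin_bytes = 32,768 (8^5)
}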
+ * + * Changing the ceiling of cached bytes does not cause any allocations (in-use or + * cached-in-reserve) to be freed. See \p FreeAllCached(). + */ + cudaError_t SetMaxCachedBytes( + size_t max_cached_bytes) + { + // Lock + mutex.Lock(); + + if (debug) _CubLog("Changing max_cached_bytes (%lld -> %lld)\n", (long long) this->max_cached_bytes, (long long) max_cached_bytes); + + this->max_cached_bytes = max_cached_bytes; + + // Unlock + mutex.Unlock(); + + return cudaSuccess; + } + + + /** + * \brief Provides a suitable allocation of device memory for the given size on the specified device. + * + * Once freed, the allocation becomes available immediately for reuse within the \p active_stream + * with which it was associated with during allocation, and it becomes available for reuse within other + * streams when all prior work submitted to \p active_stream has completed. + */ + cudaError_t DeviceAllocate( + int device, ///< [in] Device on which to place the allocation + void **d_ptr, ///< [out] Reference to pointer to the allocation + size_t bytes, ///< [in] Minimum number of bytes for the allocation + cudaStream_t active_stream = 0) ///< [in] The stream to be associated with this allocation + { + *d_ptr = NULL; + int entrypoint_device = INVALID_DEVICE_ORDINAL; + cudaError_t error = cudaSuccess; + + if (device == INVALID_DEVICE_ORDINAL) + { + if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error; + device = entrypoint_device; + } + + // Create a block descriptor for the requested allocation + bool found = false; + BlockDescriptor search_key(device); + search_key.associated_stream = active_stream; + NearestPowerOf(search_key.bin, search_key.bytes, bin_growth, bytes); + + if (search_key.bin > max_bin) + { + // Bin is greater than our maximum bin: allocate the request + // exactly and give out-of-bounds bin. It will not be cached + // for reuse when returned. + search_key.bin = INVALID_BIN; + search_key.bytes = bytes; + } + else + { + // Search for a suitable cached allocation: lock + mutex.Lock(); + + if (search_key.bin < min_bin) + { + // Bin is less than minimum bin: round up + search_key.bin = min_bin; + search_key.bytes = min_bin_bytes; + } + + // Iterate through the range of cached blocks on the same device in the same bin + CachedBlocks::iterator block_itr = cached_blocks.lower_bound(search_key); + while ((block_itr != cached_blocks.end()) + && (block_itr->device == device) + && (block_itr->bin == search_key.bin)) + { + // To prevent races with reusing blocks returned by the host but still + // in use by the device, only consider cached blocks that are + // either (from the active stream) or (from an idle stream) + if ((active_stream == block_itr->associated_stream) || + (cudaEventQuery(block_itr->ready_event) != cudaErrorNotReady)) + { + // Reuse existing cache block. Insert into live blocks. 
+ found = true; + search_key = *block_itr; + search_key.associated_stream = active_stream; + live_blocks.insert(search_key); + + // Remove from free blocks + cached_bytes[device].free -= search_key.bytes; + cached_bytes[device].live += search_key.bytes; + + if (debug) _CubLog("\tDevice %d reused cached block at %p (%lld bytes) for stream %lld (previously associated with stream %lld).\n", + device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) block_itr->associated_stream); + + cached_blocks.erase(block_itr); + + break; + } + block_itr++; + } + + // Done searching: unlock + mutex.Unlock(); + } + + // Allocate the block if necessary + if (!found) + { + // Set runtime's current device to specified device (entrypoint may not be set) + if (device != entrypoint_device) + { + if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error; + if (CubDebug(error = cudaSetDevice(device))) return error; + } + + // Attempt to allocate + if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes)) == cudaErrorMemoryAllocation) + { + // The allocation attempt failed: free all cached blocks on device and retry + if (debug) _CubLog("\tDevice %d failed to allocate %lld bytes for stream %lld, retrying after freeing cached allocations", + device, (long long) search_key.bytes, (long long) search_key.associated_stream); + + error = cudaSuccess; // Reset the error we will return + cudaGetLastError(); // Reset CUDART's error + + // Lock + mutex.Lock(); + + // Iterate the range of free blocks on the same device + BlockDescriptor free_key(device); + CachedBlocks::iterator block_itr = cached_blocks.lower_bound(free_key); + + while ((block_itr != cached_blocks.end()) && (block_itr->device == device)) + { + // No need to worry about synchronization with the device: cudaFree is + // blocking and will synchronize across all kernels executing + // on the current device + + // Free device memory and destroy stream event. 
+ if (CubDebug(error = cudaFree(block_itr->d_ptr))) break; + if (CubDebug(error = cudaEventDestroy(block_itr->ready_event))) break; + + // Reduce balance and erase entry + cached_bytes[device].free -= block_itr->bytes; + + if (debug) _CubLog("\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", + device, (long long) block_itr->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live); + + cached_blocks.erase(block_itr); + + block_itr++; + } + + // Unlock + mutex.Unlock(); + + // Return under error + if (error) return error; + + // Try to allocate again + if (CubDebug(error = cudaMalloc(&search_key.d_ptr, search_key.bytes))) return error; + } + + // Create ready event + if (CubDebug(error = cudaEventCreateWithFlags(&search_key.ready_event, cudaEventDisableTiming))) + return error; + + // Insert into live blocks + mutex.Lock(); + live_blocks.insert(search_key); + cached_bytes[device].live += search_key.bytes; + mutex.Unlock(); + + if (debug) _CubLog("\tDevice %d allocated new device block at %p (%lld bytes associated with stream %lld).\n", + device, search_key.d_ptr, (long long) search_key.bytes, (long long) search_key.associated_stream); + + // Attempt to revert back to previous device if necessary + if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) + { + if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; + } + } + + // Copy device pointer to output parameter + *d_ptr = search_key.d_ptr; + + if (debug) _CubLog("\t\t%lld available blocks cached (%lld bytes), %lld live blocks outstanding(%lld bytes).\n", + (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live); + + return error; + } + + + /** + * \brief Provides a suitable allocation of device memory for the given size on the current device. + * + * Once freed, the allocation becomes available immediately for reuse within the \p active_stream + * with which it was associated with during allocation, and it becomes available for reuse within other + * streams when all prior work submitted to \p active_stream has completed. + */ + cudaError_t DeviceAllocate( + void **d_ptr, ///< [out] Reference to pointer to the allocation + size_t bytes, ///< [in] Minimum number of bytes for the allocation + cudaStream_t active_stream = 0) ///< [in] The stream to be associated with this allocation + { + return DeviceAllocate(INVALID_DEVICE_ORDINAL, d_ptr, bytes, active_stream); + } + + + /** + * \brief Frees a live allocation of device memory on the specified device, returning it to the allocator. + * + * Once freed, the allocation becomes available immediately for reuse within the \p active_stream + * with which it was associated with during allocation, and it becomes available for reuse within other + * streams when all prior work submitted to \p active_stream has completed. 
+ */ + cudaError_t DeviceFree( + int device, + void* d_ptr) + { + int entrypoint_device = INVALID_DEVICE_ORDINAL; + cudaError_t error = cudaSuccess; + + if (device == INVALID_DEVICE_ORDINAL) + { + if (CubDebug(error = cudaGetDevice(&entrypoint_device))) + return error; + device = entrypoint_device; + } + + // Lock + mutex.Lock(); + + // Find corresponding block descriptor + bool recached = false; + BlockDescriptor search_key(d_ptr, device); + BusyBlocks::iterator block_itr = live_blocks.find(search_key); + if (block_itr != live_blocks.end()) + { + // Remove from live blocks + search_key = *block_itr; + live_blocks.erase(block_itr); + cached_bytes[device].live -= search_key.bytes; + + // Keep the returned allocation if bin is valid and we won't exceed the max cached threshold + if ((search_key.bin != INVALID_BIN) && (cached_bytes[device].free + search_key.bytes <= max_cached_bytes)) + { + // Insert returned allocation into free blocks + recached = true; + cached_blocks.insert(search_key); + cached_bytes[device].free += search_key.bytes; + + if (debug) _CubLog("\tDevice %d returned %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks outstanding. (%lld bytes)\n", + device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), + (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live); + } + } + + // Unlock + mutex.Unlock(); + + // First set to specified device (entrypoint may not be set) + if (device != entrypoint_device) + { + if (CubDebug(error = cudaGetDevice(&entrypoint_device))) return error; + if (CubDebug(error = cudaSetDevice(device))) return error; + } + + if (recached) + { + // Insert the ready event in the associated stream (must have current device set properly) + if (CubDebug(error = cudaEventRecord(search_key.ready_event, search_key.associated_stream))) return error; + } + else + { + // Free the allocation from the runtime and cleanup the event. + if (CubDebug(error = cudaFree(d_ptr))) return error; + if (CubDebug(error = cudaEventDestroy(search_key.ready_event))) return error; + + if (debug) _CubLog("\tDevice %d freed %lld bytes from associated stream %lld.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", + device, (long long) search_key.bytes, (long long) search_key.associated_stream, (long long) cached_blocks.size(), (long long) cached_bytes[device].free, (long long) live_blocks.size(), (long long) cached_bytes[device].live); + } + + // Reset device + if ((entrypoint_device != INVALID_DEVICE_ORDINAL) && (entrypoint_device != device)) + { + if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; + } + + return error; + } + + + /** + * \brief Frees a live allocation of device memory on the current device, returning it to the allocator. + * + * Once freed, the allocation becomes available immediately for reuse within the \p active_stream + * with which it was associated with during allocation, and it becomes available for reuse within other + * streams when all prior work submitted to \p active_stream has completed. 
+ */ + cudaError_t DeviceFree( + void* d_ptr) + { + return DeviceFree(INVALID_DEVICE_ORDINAL, d_ptr); + } + + + /** + * \brief Frees all cached device allocations on all devices + */ + cudaError_t FreeAllCached() + { + cudaError_t error = cudaSuccess; + int entrypoint_device = INVALID_DEVICE_ORDINAL; + int current_device = INVALID_DEVICE_ORDINAL; + + mutex.Lock(); + + while (!cached_blocks.empty()) + { + // Get first block + CachedBlocks::iterator begin = cached_blocks.begin(); + + // Get entry-point device ordinal if necessary + if (entrypoint_device == INVALID_DEVICE_ORDINAL) + { + if (CubDebug(error = cudaGetDevice(&entrypoint_device))) break; + } + + // Set current device ordinal if necessary + if (begin->device != current_device) + { + if (CubDebug(error = cudaSetDevice(begin->device))) break; + current_device = begin->device; + } + + // Free device memory + if (CubDebug(error = cudaFree(begin->d_ptr))) break; + if (CubDebug(error = cudaEventDestroy(begin->ready_event))) break; + + // Reduce balance and erase entry + cached_bytes[current_device].free -= begin->bytes; + + if (debug) _CubLog("\tDevice %d freed %lld bytes.\n\t\t %lld available blocks cached (%lld bytes), %lld live blocks (%lld bytes) outstanding.\n", + current_device, (long long) begin->bytes, (long long) cached_blocks.size(), (long long) cached_bytes[current_device].free, (long long) live_blocks.size(), (long long) cached_bytes[current_device].live); + + cached_blocks.erase(begin); + } + + mutex.Unlock(); + + // Attempt to revert back to entry-point device if necessary + if (entrypoint_device != INVALID_DEVICE_ORDINAL) + { + if (CubDebug(error = cudaSetDevice(entrypoint_device))) return error; + } + + return error; + } + + + /** + * \brief Destructor + */ + virtual ~CachingDeviceAllocator() + { + if (!skip_cleanup) + FreeAllCached(); + } + +}; + + + + +/** @} */ // end group UtilMgmt + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/util_arch.cuh b/GraphBLAS/CUDA/local_cub/util_arch.cuh new file mode 100644 index 0000000000..28d81e7cd0 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/util_arch.cuh @@ -0,0 +1,151 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
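A hedged end-to-end sketch of the allocator interface above; the global allocator, the stream, and the request size are illustrative:

cub::CachingDeviceAllocator g_allocator;    // default bins: 512B, 4KB, 32KB, 256KB, 2MB

cudaError_t ScratchSketch(cudaStream_t stream)
{
    double *d_scratch = NULL;

    // A 16 KB request is rounded up to the 32 KB bin (8^5 bytes):
    cudaError_t error = g_allocator.DeviceAllocate((void **) &d_scratch,
                                                   2048 * sizeof(double), stream);
    if (error) return error;

    // ... launch kernels on `stream` that use d_scratch ...

    // The block returns to the 32 KB bin: immediately reusable on `stream`, and
    // reusable on other streams once prior work submitted to `stream` completes.
    error = g_allocator.DeviceFree(d_scratch);

    // Optionally release all cached blocks back to the CUDA runtime:
    // g_allocator.FreeAllCached();

    return error;
}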
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Static architectural properties by SM version. + */ + +#pragma once + +#include "util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +#if (__CUDACC_VER_MAJOR__ >= 9) && !defined(CUB_USE_COOPERATIVE_GROUPS) + #define CUB_USE_COOPERATIVE_GROUPS +#endif + +/// CUB_PTX_ARCH reflects the PTX version targeted by the active compiler pass (or zero during the host pass). +#ifndef CUB_PTX_ARCH + #ifndef __CUDA_ARCH__ + #define CUB_PTX_ARCH 0 + #else + #define CUB_PTX_ARCH __CUDA_ARCH__ + #endif +#endif + + +/// Whether or not the source targeted by the active compiler pass is allowed to invoke device kernels or methods from the CUDA runtime API. +#ifndef CUB_RUNTIME_FUNCTION + #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__>= 350 && defined(__CUDACC_RDC__)) + #define CUB_RUNTIME_ENABLED + #define CUB_RUNTIME_FUNCTION __host__ __device__ + #else + #define CUB_RUNTIME_FUNCTION __host__ + #endif +#endif + + +/// Number of threads per warp +#ifndef CUB_LOG_WARP_THREADS + #define CUB_LOG_WARP_THREADS(arch) \ + (5) + #define CUB_WARP_THREADS(arch) \ + (1 << CUB_LOG_WARP_THREADS(arch)) + + #define CUB_PTX_WARP_THREADS CUB_WARP_THREADS(CUB_PTX_ARCH) + #define CUB_PTX_LOG_WARP_THREADS CUB_LOG_WARP_THREADS(CUB_PTX_ARCH) +#endif + + +/// Number of smem banks +#ifndef CUB_LOG_SMEM_BANKS + #define CUB_LOG_SMEM_BANKS(arch) \ + ((arch >= 200) ? \ + (5) : \ + (4)) + #define CUB_SMEM_BANKS(arch) \ + (1 << CUB_LOG_SMEM_BANKS(arch)) + + #define CUB_PTX_LOG_SMEM_BANKS CUB_LOG_SMEM_BANKS(CUB_PTX_ARCH) + #define CUB_PTX_SMEM_BANKS CUB_SMEM_BANKS(CUB_PTX_ARCH) +#endif + + +/// Oversubscription factor +#ifndef CUB_SUBSCRIPTION_FACTOR + #define CUB_SUBSCRIPTION_FACTOR(arch) \ + ((arch >= 300) ? \ + (5) : \ + ((arch >= 200) ? \ + (3) : \ + (10))) + #define CUB_PTX_SUBSCRIPTION_FACTOR CUB_SUBSCRIPTION_FACTOR(CUB_PTX_ARCH) +#endif + + +/// Prefer padding overhead vs X-way conflicts greater than this threshold +#ifndef CUB_PREFER_CONFLICT_OVER_PADDING + #define CUB_PREFER_CONFLICT_OVER_PADDING(arch) \ + ((arch >= 300) ? \ + (1) : \ + (4)) + #define CUB_PTX_PREFER_CONFLICT_OVER_PADDING CUB_PREFER_CONFLICT_OVER_PADDING(CUB_PTX_ARCH) +#endif + + +/// Scale down the number of threads to keep same amount of scratch storage as the nominal configuration for 4B data. Minimum of two warps. +#ifndef CUB_SCALED_BLOCK_THREADS + #define CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH) \ + (CUB_MIN( \ + NOMINAL_4B_BLOCK_THREADS, \ + CUB_WARP_THREADS(PTX_ARCH) * CUB_MAX( \ + 2, \ + (NOMINAL_4B_BLOCK_THREADS / CUB_WARP_THREADS(PTX_ARCH)) * 4 / sizeof(T)))) +#endif + +/// Scale down number of items per thread to keep the same amount of register storage as the nominal configuration for 4B data. 
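A worked example of the thread-scaling macro above, assuming a kernel tuned for 128 threads and 4-byte items that is re-targeted at 8-byte doubles (warp size 32):

//   CUB_SCALED_BLOCK_THREADS(128, double, 200)
//     = CUB_MIN(128, 32 * CUB_MAX(2, (128 / 32) * 4 / sizeof(double)))
//     = CUB_MIN(128, 32 * 2)
//     = 64 threads per block
enum { SCALED_THREADS = CUB_SCALED_BLOCK_THREADS(128, double, 200) };   // 64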
Minimum 1 item per thread +#ifndef CUB_SCALED_ITEMS_PER_THREAD + #define CUB_SCALED_ITEMS_PER_THREAD(NOMINAL_4B_ITEMS_PER_THREAD, NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH) \ + CUB_MAX( \ + 1, \ + (sizeof(T) < 4) ? \ + ((NOMINAL_4B_ITEMS_PER_THREAD * NOMINAL_4B_BLOCK_THREADS * 4) / CUB_MAX(4, sizeof(T))) / CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH) / 2 : \ + ((NOMINAL_4B_ITEMS_PER_THREAD * NOMINAL_4B_BLOCK_THREADS * 4) / CUB_MAX(4, sizeof(T))) / CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, PTX_ARCH)) +#endif + +/// Define both nominal threads-per-block and items-per-thread +#ifndef CUB_SCALED_GRANULARITIES + #define CUB_SCALED_GRANULARITIES(NOMINAL_4B_BLOCK_THREADS, NOMINAL_4B_ITEMS_PER_THREAD, T) \ + CUB_SCALED_BLOCK_THREADS(NOMINAL_4B_BLOCK_THREADS, T, 200), \ + CUB_SCALED_ITEMS_PER_THREAD(NOMINAL_4B_ITEMS_PER_THREAD, NOMINAL_4B_BLOCK_THREADS, T, 200) +#endif + + + +#endif // Do not document + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/util_debug.cuh b/GraphBLAS/CUDA/local_cub/util_debug.cuh new file mode 100644 index 0000000000..3ad832e731 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/util_debug.cuh @@ -0,0 +1,145 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Error and event logging routines. + * + * The following macros definitions are supported: + * - \p CUB_LOG. Simple event messages are printed to \p stdout. 
+ */ + +#pragma once + +#include +#include "util_namespace.cuh" +#include "util_arch.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilMgmt + * @{ + */ + + +/// CUB error reporting macro (prints error messages to stderr) +#if (defined(DEBUG) || defined(_DEBUG)) && !defined(CUB_STDERR) + #define CUB_STDERR +#endif + + + +/** + * \brief %If \p CUB_STDERR is defined and \p error is not \p cudaSuccess, the corresponding error message is printed to \p stderr (or \p stdout in device code) along with the supplied source context. + * + * \return The CUDA error. + */ +__host__ __device__ __forceinline__ cudaError_t Debug( + cudaError_t error, + const char* filename, + int line) +{ + (void)filename; + (void)line; +#ifdef CUB_STDERR + if (error) + { + #if (CUB_PTX_ARCH == 0) + fprintf(stderr, "CUDA error %d [%s, %d]: %s\n", error, filename, line, cudaGetErrorString(error)); + fflush(stderr); + #elif (CUB_PTX_ARCH >= 200) + printf("CUDA error %d [block (%d,%d,%d) thread (%d,%d,%d), %s, %d]\n", error, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, filename, line); + #endif + } +#endif + return error; +} + + +/** + * \brief Debug macro + */ +#ifndef CubDebug + #define CubDebug(e) cub::Debug((cudaError_t) (e), __FILE__, __LINE__) +#endif + + +/** + * \brief Debug macro with exit + */ +#ifndef CubDebugExit + #define CubDebugExit(e) if (cub::Debug((cudaError_t) (e), __FILE__, __LINE__)) { exit(1); } +#endif + + +/** + * \brief Log macro for printf statements. + */ +#if !defined(_CubLog) + #if !(defined(__clang__) && defined(__CUDA__)) + #if (CUB_PTX_ARCH == 0) + #define _CubLog(format, ...) printf(format,__VA_ARGS__); + #elif (CUB_PTX_ARCH >= 200) + #define _CubLog(format, ...) printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, __VA_ARGS__); + #endif + #else + // XXX shameless hack for clang around variadic printf... + // Compilies w/o supplying -std=c++11 but shows warning, + // so we sielence them :) + #pragma clang diagnostic ignored "-Wc++11-extensions" + #pragma clang diagnostic ignored "-Wunnamed-type-template-args" + template + inline __host__ __device__ void va_printf(char const* format, Args const&... args) + { + #ifdef __CUDA_ARCH__ + printf(format, blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, args...); + #else + printf(format, args...); + #endif + } + #ifndef __CUDA_ARCH__ + #define _CubLog(format, ...) va_printf(format,__VA_ARGS__); + #else + #define _CubLog(format, ...) va_printf("[block (%d,%d,%d), thread (%d,%d,%d)]: " format, __VA_ARGS__); + #endif + #endif +#endif + + + + +/** @} */ // end group UtilMgmt + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/util_device.cuh b/GraphBLAS/CUDA/local_cub/util_device.cuh new file mode 100644 index 0000000000..a5f3b61443 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/util_device.cuh @@ -0,0 +1,347 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
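A hedged host-side sketch of the error-checking macros defined in util_debug.cuh above; the wrapper function and buffer size are illustrative:

cudaError_t AllocSketch()
{
    float *d_buf = NULL;

    // On failure, CubDebug prints "CUDA error ... [file, line]" to stderr
    // (when CUB_STDERR is defined) and returns the error unchanged:
    cudaError_t error = CubDebug(cudaMalloc((void **) &d_buf, 1024 * sizeof(float)));
    if (error) return error;

    // CubDebugExit additionally calls exit(1) when the wrapped call fails:
    CubDebugExit(cudaFree(d_buf));
    return cudaSuccess;
}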
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Properties of a given CUDA device and the corresponding PTX bundle + */ + +#pragma once + +#include "util_type.cuh" +#include "util_arch.cuh" +#include "util_debug.cuh" +#include "util_namespace.cuh" +#include "util_macro.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilMgmt + * @{ + */ + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + +/** + * Alias temporaries to externally-allocated device storage (or simply return the amount of storage needed). + */ +template +__host__ __device__ __forceinline__ +cudaError_t AliasTemporaries( + void *d_temp_storage, ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done. 
+ size_t &temp_storage_bytes, ///< [in,out] Size in bytes of \t d_temp_storage allocation + void* (&allocations)[ALLOCATIONS], ///< [in,out] Pointers to device allocations needed + size_t (&allocation_sizes)[ALLOCATIONS]) ///< [in] Sizes in bytes of device allocations needed +{ + const int ALIGN_BYTES = 256; + const int ALIGN_MASK = ~(ALIGN_BYTES - 1); + + // Compute exclusive prefix sum over allocation requests + size_t allocation_offsets[ALLOCATIONS]; + size_t bytes_needed = 0; + for (int i = 0; i < ALLOCATIONS; ++i) + { + size_t allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK; + allocation_offsets[i] = bytes_needed; + bytes_needed += allocation_bytes; + } + bytes_needed += ALIGN_BYTES - 1; + + // Check if the caller is simply requesting the size of the storage allocation + if (!d_temp_storage) + { + temp_storage_bytes = bytes_needed; + return cudaSuccess; + } + + // Check if enough storage provided + if (temp_storage_bytes < bytes_needed) + { + return CubDebug(cudaErrorInvalidValue); + } + + // Alias + d_temp_storage = (void *) ((size_t(d_temp_storage) + ALIGN_BYTES - 1) & ALIGN_MASK); + for (int i = 0; i < ALLOCATIONS; ++i) + { + allocations[i] = static_cast(d_temp_storage) + allocation_offsets[i]; + } + + return cudaSuccess; +} + + +/** + * Empty kernel for querying PTX manifest metadata (e.g., version) for the current device + */ +template +__global__ void EmptyKernel(void) { } + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + +/** + * \brief Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10) + */ +CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int &ptx_version) +{ + struct Dummy + { + /// Type definition of the EmptyKernel kernel entry point + typedef void (*EmptyKernelPtr)(); + + /// Force EmptyKernel to be generated if this class is used + CUB_RUNTIME_FUNCTION __forceinline__ + EmptyKernelPtr Empty() + { + return EmptyKernel; + } + }; + + +#ifndef CUB_RUNTIME_ENABLED + (void)ptx_version; + + // CUDA API calls not supported from this device + return cudaErrorInvalidConfiguration; + +#elif (CUB_PTX_ARCH > 0) + + ptx_version = CUB_PTX_ARCH; + return cudaSuccess; + +#else + + cudaError_t error = cudaSuccess; + do + { + cudaFuncAttributes empty_kernel_attrs; + if (CubDebug(error = cudaFuncGetAttributes(&empty_kernel_attrs, EmptyKernel))) break; + ptx_version = empty_kernel_attrs.ptxVersion * 10; + } + while (0); + + return error; + +#endif +} + + +/** + * \brief Retrieves the SM version (major * 100 + minor * 10) + */ +CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersion(int &sm_version, int device_ordinal) +{ +#ifndef CUB_RUNTIME_ENABLED + (void)sm_version; + (void)device_ordinal; + + // CUDA API calls not supported from this device + return cudaErrorInvalidConfiguration; + +#else + + cudaError_t error = cudaSuccess; + do + { + // Fill in SM version + int major, minor; + if (CubDebug(error = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device_ordinal))) break; + if (CubDebug(error = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device_ordinal))) break; + sm_version = major * 100 + minor * 10; + } + while (0); + + return error; + +#endif +} + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +/** + * Synchronize the stream if specified + */ +CUB_RUNTIME_FUNCTION __forceinline__ +static cudaError_t SyncStream(cudaStream_t stream) +{ +#if (CUB_PTX_ARCH == 0) + return cudaStreamSynchronize(stream); +#else + (void)stream; + // Device can't yet sync on a specific 
stream + return cudaDeviceSynchronize(); +#endif +} + + +/** + * \brief Computes maximum SM occupancy in thread blocks for executing the given kernel function pointer \p kernel_ptr on the current device with \p block_threads per thread block. + * + * \par Snippet + * The code snippet below illustrates the use of the MaxSmOccupancy function. + * \par + * \code + * #include // or equivalently + * + * template + * __global__ void ExampleKernel() + * { + * // Allocate shared memory for BlockScan + * __shared__ volatile T buffer[4096]; + * + * ... + * } + * + * ... + * + * // Determine SM occupancy for ExampleKernel specialized for unsigned char + * int max_sm_occupancy; + * MaxSmOccupancy(max_sm_occupancy, ExampleKernel, 64); + * + * // max_sm_occupancy <-- 4 on SM10 + * // max_sm_occupancy <-- 8 on SM20 + * // max_sm_occupancy <-- 12 on SM35 + * + * \endcode + * + */ +template +CUB_RUNTIME_FUNCTION __forceinline__ +cudaError_t MaxSmOccupancy( + int &max_sm_occupancy, ///< [out] maximum number of thread blocks that can reside on a single SM + KernelPtr kernel_ptr, ///< [in] Kernel pointer for which to compute SM occupancy + int block_threads, ///< [in] Number of threads per thread block + int dynamic_smem_bytes = 0) +{ +#ifndef CUB_RUNTIME_ENABLED + (void)dynamic_smem_bytes; + (void)block_threads; + (void)kernel_ptr; + (void)max_sm_occupancy; + + // CUDA API calls not supported from this device + return CubDebug(cudaErrorInvalidConfiguration); + +#else + + return cudaOccupancyMaxActiveBlocksPerMultiprocessor ( + &max_sm_occupancy, + kernel_ptr, + block_threads, + dynamic_smem_bytes); + +#endif // CUB_RUNTIME_ENABLED +} + + +/****************************************************************************** + * Policy management + ******************************************************************************/ + +/** + * Kernel dispatch configuration + */ +struct KernelConfig +{ + int block_threads; + int items_per_thread; + int tile_size; + int sm_occupancy; + + CUB_RUNTIME_FUNCTION __forceinline__ + KernelConfig() : block_threads(0), items_per_thread(0), tile_size(0), sm_occupancy(0) {} + + template + CUB_RUNTIME_FUNCTION __forceinline__ + cudaError_t Init(KernelPtrT kernel_ptr) + { + block_threads = AgentPolicyT::BLOCK_THREADS; + items_per_thread = AgentPolicyT::ITEMS_PER_THREAD; + tile_size = block_threads * items_per_thread; + cudaError_t retval = MaxSmOccupancy(sm_occupancy, kernel_ptr, block_threads); + return retval; + } +}; + + + +/// Helper for dispatching into a policy chain +template +struct ChainedPolicy +{ + /// The policy for the active compiler pass + typedef typename If<(CUB_PTX_ARCH < PTX_VERSION), typename PrevPolicyT::ActivePolicy, PolicyT>::Type ActivePolicy; + + /// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version + template + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Invoke(int ptx_version, FunctorT &op) + { + if (ptx_version < PTX_VERSION) { + return PrevPolicyT::Invoke(ptx_version, op); + } + return op.template Invoke(); + } +}; + +/// Helper for dispatching into a policy chain (end-of-chain specialization) +template +struct ChainedPolicy +{ + /// The policy for the active compiler pass + typedef PolicyT ActivePolicy; + + /// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version + template + CUB_RUNTIME_FUNCTION __forceinline__ + static cudaError_t Invoke(int /*ptx_version*/, FunctorT &op) { + return op.template Invoke(); + } +}; + + + + +#endif // Do not 
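A hedged sketch combining PtxVersion and MaxSmOccupancy from util_device.cuh above; the kernel and block size are illustrative:

__global__ void OccupancySketchKernel(int *d_out)
{
    if (threadIdx.x == 0) d_out[blockIdx.x] = 0;
}

void OccupancySketch()
{
    int ptx_version = 0;
    CubDebugExit(cub::PtxVersion(ptx_version));     // e.g. 700 when compiled for sm_70

    int max_blocks_per_sm = 0;
    CubDebugExit(cub::MaxSmOccupancy(max_blocks_per_sm, OccupancySketchKernel, 256));
    // max_blocks_per_sm now holds the kernel's occupancy at 256 threads per block
}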
document + + + + +/** @} */ // end group UtilMgmt + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/util_macro.cuh b/GraphBLAS/CUDA/local_cub/util_macro.cuh new file mode 100644 index 0000000000..ff86365422 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/util_macro.cuh @@ -0,0 +1,103 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/****************************************************************************** + * Common C/C++ macro utilities + ******************************************************************************/ + +#pragma once + +#include "util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilModule + * @{ + */ + +#ifndef CUB_ALIGN + #if defined(_WIN32) || defined(_WIN64) + /// Align struct + #define CUB_ALIGN(bytes) __declspec(align(32)) + #else + /// Align struct + #define CUB_ALIGN(bytes) __attribute__((aligned(bytes))) + #endif +#endif + +#ifndef CUB_MAX + /// Select maximum(a, b) + #define CUB_MAX(a, b) (((b) > (a)) ? (b) : (a)) +#endif + +#ifndef CUB_MIN + /// Select minimum(a, b) + #define CUB_MIN(a, b) (((b) < (a)) ? 
(b) : (a)) +#endif + +#ifndef CUB_QUOTIENT_FLOOR + /// Quotient of x/y rounded down to nearest integer + #define CUB_QUOTIENT_FLOOR(x, y) ((x) / (y)) +#endif + +#ifndef CUB_QUOTIENT_CEILING + /// Quotient of x/y rounded up to nearest integer + #define CUB_QUOTIENT_CEILING(x, y) (((x) + (y) - 1) / (y)) +#endif + +#ifndef CUB_ROUND_UP_NEAREST + /// x rounded up to the nearest multiple of y + #define CUB_ROUND_UP_NEAREST(x, y) ((((x) + (y) - 1) / (y)) * y) +#endif + +#ifndef CUB_ROUND_DOWN_NEAREST + /// x rounded down to the nearest multiple of y + #define CUB_ROUND_DOWN_NEAREST(x, y) (((x) / (y)) * y) +#endif + + +#ifndef CUB_STATIC_ASSERT + #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + #define CUB_CAT_(a, b) a ## b + #define CUB_CAT(a, b) CUB_CAT_(a, b) + #endif // DOXYGEN_SHOULD_SKIP_THIS + + /// Static assert + #define CUB_STATIC_ASSERT(cond, msg) typedef int CUB_CAT(cub_static_assert, __LINE__)[(cond) ? 1 : -1] +#endif + +/** @} */ // end group UtilModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/util_namespace.cuh b/GraphBLAS/CUDA/local_cub/util_namespace.cuh new file mode 100644 index 0000000000..c8991d08fb --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/util_namespace.cuh @@ -0,0 +1,46 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
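An illustrative use of the arithmetic macros defined in util_macro.cuh above for sizing a kernel grid; the element count and block size are made-up values, and the header is assumed to be included.

void grid_sizing_example()
{
    int grid_blocks = CUB_QUOTIENT_CEILING(1000, 128);    // (1000 + 127) / 128 == 8 blocks
    int padded_n    = CUB_ROUND_UP_NEAREST(1000, 128);    // 8 * 128 == 1024
    int trimmed_n   = CUB_ROUND_DOWN_NEAREST(1000, 128);  // 7 * 128 == 896
    CUB_STATIC_ASSERT(sizeof(int) >= 4, "sketch assumes at least a 32-bit int");
    (void) grid_blocks; (void) padded_n; (void) trimmed_n;
}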
+ * + ******************************************************************************/ + +/** + * \file + * Place-holder for prefixing the cub namespace + */ + +#pragma once + +// For example: +//#define CUB_NS_PREFIX namespace thrust{ namespace detail { +//#define CUB_NS_POSTFIX } } + +#ifndef CUB_NS_PREFIX +#define CUB_NS_PREFIX +#endif + +#ifndef CUB_NS_POSTFIX +#define CUB_NS_POSTFIX +#endif diff --git a/GraphBLAS/CUDA/local_cub/util_ptx.cuh b/GraphBLAS/CUDA/local_cub/util_ptx.cuh new file mode 100644 index 0000000000..582ca0d8b8 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/util_ptx.cuh @@ -0,0 +1,758 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
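A sketch of how the CUB_NS_PREFIX / CUB_NS_POSTFIX hooks from util_namespace.cuh above can nest cub inside a project namespace; the outer namespace name graphblas_gpu is hypothetical.

// Define the hooks before including any local_cub header.
#define CUB_NS_PREFIX  namespace graphblas_gpu {
#define CUB_NS_POSTFIX }
#include "util_macro.cuh"   // cub:: symbols now resolve as graphblas_gpu::cub::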
+ * + ******************************************************************************/ + +/** + * \file + * PTX intrinsics + */ + + +#pragma once + +#include "util_type.cuh" +#include "util_arch.cuh" +#include "util_namespace.cuh" +#include "util_debug.cuh" + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilPtx + * @{ + */ + + +/****************************************************************************** + * PTX helper macros + ******************************************************************************/ + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +/** + * Register modifier for pointer-types (for inlining PTX assembly) + */ +#if defined(_WIN64) || defined(__LP64__) + #define __CUB_LP64__ 1 + // 64-bit register modifier for inlined asm + #define _CUB_ASM_PTR_ "l" + #define _CUB_ASM_PTR_SIZE_ "u64" +#else + #define __CUB_LP64__ 0 + // 32-bit register modifier for inlined asm + #define _CUB_ASM_PTR_ "r" + #define _CUB_ASM_PTR_SIZE_ "u32" +#endif + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/****************************************************************************** + * Inlined PTX intrinsics + ******************************************************************************/ + +/** + * \brief Shift-right then add. Returns (\p x >> \p shift) + \p addend. + */ +__device__ __forceinline__ unsigned int SHR_ADD( + unsigned int x, + unsigned int shift, + unsigned int addend) +{ + unsigned int ret; +#if CUB_PTX_ARCH >= 200 + asm ("vshr.u32.u32.u32.clamp.add %0, %1, %2, %3;" : + "=r"(ret) : "r"(x), "r"(shift), "r"(addend)); +#else + ret = (x >> shift) + addend; +#endif + return ret; +} + + +/** + * \brief Shift-left then add. Returns (\p x << \p shift) + \p addend. + */ +__device__ __forceinline__ unsigned int SHL_ADD( + unsigned int x, + unsigned int shift, + unsigned int addend) +{ + unsigned int ret; +#if CUB_PTX_ARCH >= 200 + asm ("vshl.u32.u32.u32.clamp.add %0, %1, %2, %3;" : + "=r"(ret) : "r"(x), "r"(shift), "r"(addend)); +#else + ret = (x << shift) + addend; +#endif + return ret; +} + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +/** + * Bitfield-extract. + */ +template +__device__ __forceinline__ unsigned int BFE( + UnsignedBits source, + unsigned int bit_start, + unsigned int num_bits, + Int2Type /*byte_len*/) +{ + unsigned int bits; +#if CUB_PTX_ARCH >= 200 + asm ("bfe.u32 %0, %1, %2, %3;" : "=r"(bits) : "r"((unsigned int) source), "r"(bit_start), "r"(num_bits)); +#else + const unsigned int MASK = (1 << num_bits) - 1; + bits = (source >> bit_start) & MASK; +#endif + return bits; +} + + +/** + * Bitfield-extract for 64-bit types. + */ +template +__device__ __forceinline__ unsigned int BFE( + UnsignedBits source, + unsigned int bit_start, + unsigned int num_bits, + Int2Type<8> /*byte_len*/) +{ + const unsigned long long MASK = (1ull << num_bits) - 1; + return (source >> bit_start) & MASK; +} + +#endif // DOXYGEN_SHOULD_SKIP_THIS + +/** + * \brief Bitfield-extract. Extracts \p num_bits from \p source starting at bit-offset \p bit_start. The input \p source may be an 8b, 16b, 32b, or 64b unsigned integer type. + */ +template +__device__ __forceinline__ unsigned int BFE( + UnsignedBits source, + unsigned int bit_start, + unsigned int num_bits) +{ + return BFE(source, bit_start, num_bits, Int2Type()); +} + + +/** + * \brief Bitfield insert. Inserts the \p num_bits least significant bits of \p y into \p x at bit-offset \p bit_start. 
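A short device-side sketch of the bitfield-extract and shift-add helpers above, assuming this header is included; the packed value and the expected results are illustrative.

__global__ void bit_helpers_demo(unsigned int *out)
{
    unsigned int packed = 0xAABBCCDDu;
    out[0] = cub::BFE(packed, 8, 8);       // bits [15:8]  -> 0xCC
    out[1] = cub::BFE(packed, 24, 8);      // bits [31:24] -> 0xAA
    out[2] = cub::SHR_ADD(packed, 16, 1);  // (0xAABBCCDD >> 16) + 1 == 0xAABC
}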
+ */ +__device__ __forceinline__ void BFI( + unsigned int &ret, + unsigned int x, + unsigned int y, + unsigned int bit_start, + unsigned int num_bits) +{ +#if CUB_PTX_ARCH >= 200 + asm ("bfi.b32 %0, %1, %2, %3, %4;" : + "=r"(ret) : "r"(y), "r"(x), "r"(bit_start), "r"(num_bits)); +#else + x <<= bit_start; + unsigned int MASK_X = ((1 << num_bits) - 1) << bit_start; + unsigned int MASK_Y = ~MASK_X; + ret = (y & MASK_Y) | (x & MASK_X); +#endif +} + + +/** + * \brief Three-operand add. Returns \p x + \p y + \p z. + */ +__device__ __forceinline__ unsigned int IADD3(unsigned int x, unsigned int y, unsigned int z) +{ +#if CUB_PTX_ARCH >= 200 + asm ("vadd.u32.u32.u32.add %0, %1, %2, %3;" : "=r"(x) : "r"(x), "r"(y), "r"(z)); +#else + x = x + y + z; +#endif + return x; +} + + +/** + * \brief Byte-permute. Pick four arbitrary bytes from two 32-bit registers, and reassemble them into a 32-bit destination register. For SM2.0 or later. + * + * \par + * The bytes in the two source registers \p a and \p b are numbered from 0 to 7: + * {\p b, \p a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}. For each of the four bytes + * {b3, b2, b1, b0} selected in the return value, a 4-bit selector is defined within + * the four lower "nibbles" of \p index: {\p index } = {n7, n6, n5, n4, n3, n2, n1, n0} + * + * \par Snippet + * The code snippet below illustrates byte-permute. + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * int a = 0x03020100; + * int b = 0x07060504; + * int index = 0x00007531; + * + * int selected = PRMT(a, b, index); // 0x07050301 + * + * \endcode + * + */ +__device__ __forceinline__ int PRMT(unsigned int a, unsigned int b, unsigned int index) +{ + int ret; + asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(a), "r"(b), "r"(index)); + return ret; +} + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +/** + * Sync-threads barrier. 
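A device-side sketch of the byte-permute and three-operand add helpers above; the selector and its expected result follow the PRMT snippet in the header comment.

__global__ void prmt_iadd3_demo(unsigned int *out)
{
    out[0] = cub::PRMT(0x03020100u, 0x07060504u, 0x00007531u);  // picks bytes 1,3,5,7 -> 0x07050301
    out[1] = cub::IADD3(1u, 2u, 3u);                            // 1 + 2 + 3 == 6
}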
+ */ +__device__ __forceinline__ void BAR(int count) +{ + asm volatile("bar.sync 1, %0;" : : "r"(count)); +} + +/** + * CTA barrier + */ +__device__ __forceinline__ void CTA_SYNC() +{ + __syncthreads(); +} + + +/** + * CTA barrier with predicate + */ +__device__ __forceinline__ int CTA_SYNC_AND(int p) +{ + return __syncthreads_and(p); +} + + +/** + * Warp barrier + */ +__device__ __forceinline__ void WARP_SYNC(unsigned int member_mask) +{ +#ifdef CUB_USE_COOPERATIVE_GROUPS + __syncwarp(member_mask); +#endif +} + + +/** + * Warp any + */ +__device__ __forceinline__ int WARP_ANY(int predicate, unsigned int member_mask) +{ +#ifdef CUB_USE_COOPERATIVE_GROUPS + return __any_sync(member_mask, predicate); +#else + return ::__any(predicate); +#endif +} + + +/** + * Warp any + */ +__device__ __forceinline__ int WARP_ALL(int predicate, unsigned int member_mask) +{ +#ifdef CUB_USE_COOPERATIVE_GROUPS + return __all_sync(member_mask, predicate); +#else + return ::__all(predicate); +#endif +} + + +/** + * Warp ballot + */ +__device__ __forceinline__ int WARP_BALLOT(int predicate, unsigned int member_mask) +{ +#ifdef CUB_USE_COOPERATIVE_GROUPS + return __ballot_sync(member_mask, predicate); +#else + return __ballot(predicate); +#endif +} + +/** + * Warp synchronous shfl_up + */ +__device__ __forceinline__ +unsigned int SHFL_UP_SYNC(unsigned int word, int src_offset, int flags, unsigned int member_mask) +{ +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile("shfl.sync.up.b32 %0, %1, %2, %3, %4;" + : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags), "r"(member_mask)); +#else + asm volatile("shfl.up.b32 %0, %1, %2, %3;" + : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags)); +#endif + return word; +} + +/** + * Warp synchronous shfl_down + */ +__device__ __forceinline__ +unsigned int SHFL_DOWN_SYNC(unsigned int word, int src_offset, int flags, unsigned int member_mask) +{ +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile("shfl.sync.down.b32 %0, %1, %2, %3, %4;" + : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags), "r"(member_mask)); +#else + asm volatile("shfl.down.b32 %0, %1, %2, %3;" + : "=r"(word) : "r"(word), "r"(src_offset), "r"(flags)); +#endif + return word; +} + +/** + * Warp synchronous shfl_idx + */ +__device__ __forceinline__ +unsigned int SHFL_IDX_SYNC(unsigned int word, int src_lane, int flags, unsigned int member_mask) +{ +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile("shfl.sync.idx.b32 %0, %1, %2, %3, %4;" + : "=r"(word) : "r"(word), "r"(src_lane), "r"(flags), "r"(member_mask)); +#else + asm volatile("shfl.idx.b32 %0, %1, %2, %3;" + : "=r"(word) : "r"(word), "r"(src_lane), "r"(flags)); +#endif + return word; +} + +/** + * Floating point multiply. (Mantissa LSB rounds towards zero.) + */ +__device__ __forceinline__ float FMUL_RZ(float a, float b) +{ + float d; + asm ("mul.rz.f32 %0, %1, %2;" : "=f"(d) : "f"(a), "f"(b)); + return d; +} + + +/** + * Floating point multiply-add. (Mantissa LSB rounds towards zero.) 
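A sketch of the warp-vote wrappers above for one full 32-thread warp; the even-lane predicate is arbitrary and only lane 0 records the results.

__global__ void warp_vote_demo(int *out)
{
    const unsigned int full_mask = 0xffffffffu;
    int pred   = (threadIdx.x & 1) == 0;             // true on even lanes
    int any    = cub::WARP_ANY(pred, full_mask);     // 1: at least one lane is even
    int all    = cub::WARP_ALL(pred, full_mask);     // 0: odd lanes fail the predicate
    int ballot = cub::WARP_BALLOT(pred, full_mask);  // 0x55555555: one bit per even lane
    if (cub::LaneId() == 0) { out[0] = any; out[1] = all; out[2] = ballot; }
}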
+ */ +__device__ __forceinline__ float FFMA_RZ(float a, float b, float c) +{ + float d; + asm ("fma.rz.f32 %0, %1, %2, %3;" : "=f"(d) : "f"(a), "f"(b), "f"(c)); + return d; +} + +#endif // DOXYGEN_SHOULD_SKIP_THIS + +/** + * \brief Terminates the calling thread + */ +__device__ __forceinline__ void ThreadExit() { + asm volatile("exit;"); +} + + +/** + * \brief Abort execution and generate an interrupt to the host CPU + */ +__device__ __forceinline__ void ThreadTrap() { + asm volatile("trap;"); +} + + +/** + * \brief Returns the row-major linear thread identifier for a multidimensional thread block + */ +__device__ __forceinline__ int RowMajorTid(int block_dim_x, int block_dim_y, int block_dim_z) +{ + return ((block_dim_z == 1) ? 0 : (threadIdx.z * block_dim_x * block_dim_y)) + + ((block_dim_y == 1) ? 0 : (threadIdx.y * block_dim_x)) + + threadIdx.x; +} + + +/** + * \brief Returns the warp lane ID of the calling thread + */ +__device__ __forceinline__ unsigned int LaneId() +{ + unsigned int ret; + asm ("mov.u32 %0, %%laneid;" : "=r"(ret) ); + return ret; +} + + +/** + * \brief Returns the warp ID of the calling thread. Warp ID is guaranteed to be unique among warps, but may not correspond to a zero-based ranking within the thread block. + */ +__device__ __forceinline__ unsigned int WarpId() +{ + unsigned int ret; + asm ("mov.u32 %0, %%warpid;" : "=r"(ret) ); + return ret; +} + +/** + * \brief Returns the warp lane mask of all lanes less than the calling thread + */ +__device__ __forceinline__ unsigned int LaneMaskLt() +{ + unsigned int ret; + asm ("mov.u32 %0, %%lanemask_lt;" : "=r"(ret) ); + return ret; +} + +/** + * \brief Returns the warp lane mask of all lanes less than or equal to the calling thread + */ +__device__ __forceinline__ unsigned int LaneMaskLe() +{ + unsigned int ret; + asm ("mov.u32 %0, %%lanemask_le;" : "=r"(ret) ); + return ret; +} + +/** + * \brief Returns the warp lane mask of all lanes greater than the calling thread + */ +__device__ __forceinline__ unsigned int LaneMaskGt() +{ + unsigned int ret; + asm ("mov.u32 %0, %%lanemask_gt;" : "=r"(ret) ); + return ret; +} + +/** + * \brief Returns the warp lane mask of all lanes greater than or equal to the calling thread + */ +__device__ __forceinline__ unsigned int LaneMaskGe() +{ + unsigned int ret; + asm ("mov.u32 %0, %%lanemask_ge;" : "=r"(ret) ); + return ret; +} + +/** @} */ // end group UtilPtx + + + + +/** + * \brief Shuffle-up for any data type. Each warp-lanei obtains the value \p input contributed by warp-lanei-src_offset. For thread lanes \e i < src_offset, the thread's own \p input is returned to the thread. ![](shfl_up_logo.png) + * \ingroup WarpModule + * + * \tparam LOGICAL_WARP_THREADS The number of threads per "logical" warp. Must be a power-of-two <= 32. + * \tparam T [inferred] The input/output element type + * + * \par + * - Available only for SM3.0 or newer + * + * \par Snippet + * The code snippet below illustrates each thread obtaining a \p double value from the + * predecessor of its predecessor. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Obtain one input item per thread + * double thread_data = ... + * + * // Obtain item from two ranks below + * double peer_data = ShuffleUp<32>(thread_data, 2, 0, 0xffffffff); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the first warp of threads is {1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}. + * The corresponding output \p peer_data will be {1.0, 2.0, 1.0, 2.0, 3.0, ..., 30.0}. 
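A common pattern built from the ballot and lane-mask helpers above: computing each voting lane's exclusive rank within its warp. The predicate is arbitrary and a single full 32-thread warp is assumed.

__global__ void warp_rank_demo(int *out)
{
    const unsigned int full_mask = 0xffffffffu;
    int pred = (threadIdx.x % 3) == 0;                                       // arbitrary predicate
    unsigned int voters = (unsigned int) cub::WARP_BALLOT(pred, full_mask);  // lanes voting true
    if (pred)
    {
        // The count of voting lanes below this one is this lane's exclusive rank among voters
        int rank = __popc(voters & cub::LaneMaskLt());
        out[rank] = (int) threadIdx.x;
    }
}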
+ * + */ +template < + int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp + typename T> +__device__ __forceinline__ T ShuffleUp( + T input, ///< [in] The value to broadcast + int src_offset, ///< [in] The relative down-offset of the peer to read from + int first_thread, ///< [in] Index of first lane in logical warp (typically 0) + unsigned int member_mask) ///< [in] 32-bit mask of participating warp lanes +{ + /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up + enum { + SHFL_C = (32 - LOGICAL_WARP_THREADS) << 8 + }; + + typedef typename UnitWord::ShuffleWord ShuffleWord; + + const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); + + T output; + ShuffleWord *output_alias = reinterpret_cast(&output); + ShuffleWord *input_alias = reinterpret_cast(&input); + + unsigned int shuffle_word; + shuffle_word = SHFL_UP_SYNC((unsigned int)input_alias[0], src_offset, first_thread | SHFL_C, member_mask); + output_alias[0] = shuffle_word; + + #pragma unroll + for (int WORD = 1; WORD < WORDS; ++WORD) + { + shuffle_word = SHFL_UP_SYNC((unsigned int)input_alias[WORD], src_offset, first_thread | SHFL_C, member_mask); + output_alias[WORD] = shuffle_word; + } + + return output; +} + + +/** + * \brief Shuffle-down for any data type. Each warp-lanei obtains the value \p input contributed by warp-lanei+src_offset. For thread lanes \e i >= WARP_THREADS, the thread's own \p input is returned to the thread. ![](shfl_down_logo.png) + * \ingroup WarpModule + * + * \tparam LOGICAL_WARP_THREADS The number of threads per "logical" warp. Must be a power-of-two <= 32. + * \tparam T [inferred] The input/output element type + * + * \par + * - Available only for SM3.0 or newer + * + * \par Snippet + * The code snippet below illustrates each thread obtaining a \p double value from the + * successor of its successor. + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Obtain one input item per thread + * double thread_data = ... + * + * // Obtain item from two ranks below + * double peer_data = ShuffleDown<32>(thread_data, 2, 31, 0xffffffff); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the first warp of threads is {1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}. + * The corresponding output \p peer_data will be {3.0, 4.0, 5.0, 6.0, 7.0, ..., 32.0}. 
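A sketch of the classic warp-sum pattern built on ShuffleDown (defined just below): halving offsets leave the sum of all 32 lanes in lane 0. Illustrative only; a single full warp is assumed.

__global__ void warp_sum_demo(int *out, const int *in)
{
    int val = in[threadIdx.x];
    for (int offset = 16; offset > 0; offset >>= 1)
        val += cub::ShuffleDown<32>(val, offset, 31, 0xffffffffu);
    if (cub::LaneId() == 0) out[0] = val;  // only lane 0 holds the full sum
}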
+ * + */ +template < + int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp + typename T> +__device__ __forceinline__ T ShuffleDown( + T input, ///< [in] The value to broadcast + int src_offset, ///< [in] The relative up-offset of the peer to read from + int last_thread, ///< [in] Index of last thread in logical warp (typically 31 for a 32-thread warp) + unsigned int member_mask) ///< [in] 32-bit mask of participating warp lanes +{ + /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up + enum { + SHFL_C = (32 - LOGICAL_WARP_THREADS) << 8 + }; + + typedef typename UnitWord::ShuffleWord ShuffleWord; + + const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); + + T output; + ShuffleWord *output_alias = reinterpret_cast(&output); + ShuffleWord *input_alias = reinterpret_cast(&input); + + unsigned int shuffle_word; + shuffle_word = SHFL_DOWN_SYNC((unsigned int)input_alias[0], src_offset, last_thread | SHFL_C, member_mask); + output_alias[0] = shuffle_word; + + #pragma unroll + for (int WORD = 1; WORD < WORDS; ++WORD) + { + shuffle_word = SHFL_DOWN_SYNC((unsigned int)input_alias[WORD], src_offset, last_thread | SHFL_C, member_mask); + output_alias[WORD] = shuffle_word; + } + + return output; +} + + +/** + * \brief Shuffle-broadcast for any data type. Each warp-lanei obtains the value \p input + * contributed by warp-lanesrc_lane. For \p src_lane < 0 or \p src_lane >= WARP_THREADS, + * then the thread's own \p input is returned to the thread. ![](shfl_broadcast_logo.png) + * + * \tparam LOGICAL_WARP_THREADS The number of threads per "logical" warp. Must be a power-of-two <= 32. + * \tparam T [inferred] The input/output element type + * + * \ingroup WarpModule + * + * \par + * - Available only for SM3.0 or newer + * + * \par Snippet + * The code snippet below illustrates each thread obtaining a \p double value from warp-lane0. + * + * \par + * \code + * #include // or equivalently + * + * __global__ void ExampleKernel(...) + * { + * // Obtain one input item per thread + * double thread_data = ... + * + * // Obtain item from thread 0 + * double peer_data = ShuffleIndex<32>(thread_data, 0, 0xffffffff); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the first warp of threads is {1.0, 2.0, 3.0, 4.0, 5.0, ..., 32.0}. + * The corresponding output \p peer_data will be {1.0, 1.0, 1.0, 1.0, 1.0, ..., 1.0}. 
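A sketch of broadcasting with ShuffleIndex (defined just below); for a full 32-thread warp this is equivalent to the built-in __shfl_sync. The source lane 7 is an arbitrary choice.

__global__ void warp_broadcast_demo(int *out, const int *in)
{
    int val = in[threadIdx.x];
    int from_lane7 = cub::ShuffleIndex<32>(val, 7, 0xffffffffu);  // every lane reads lane 7's value
    // Full-warp equivalent: __shfl_sync(0xffffffffu, val, 7)
    out[threadIdx.x] = from_lane7;
}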
+ * + */ +template < + int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp + typename T> +__device__ __forceinline__ T ShuffleIndex( + T input, ///< [in] The value to broadcast + int src_lane, ///< [in] Which warp lane is to do the broadcasting + unsigned int member_mask) ///< [in] 32-bit mask of participating warp lanes +{ + /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up + enum { + SHFL_C = ((32 - LOGICAL_WARP_THREADS) << 8) | (LOGICAL_WARP_THREADS - 1) + }; + + typedef typename UnitWord::ShuffleWord ShuffleWord; + + const int WORDS = (sizeof(T) + sizeof(ShuffleWord) - 1) / sizeof(ShuffleWord); + + T output; + ShuffleWord *output_alias = reinterpret_cast(&output); + ShuffleWord *input_alias = reinterpret_cast(&input); + + unsigned int shuffle_word; + shuffle_word = SHFL_IDX_SYNC((unsigned int)input_alias[0], + src_lane, + SHFL_C, + member_mask); + + output_alias[0] = shuffle_word; + + #pragma unroll + for (int WORD = 1; WORD < WORDS; ++WORD) + { + shuffle_word = SHFL_IDX_SYNC((unsigned int)input_alias[WORD], + src_lane, + SHFL_C, + member_mask); + + output_alias[WORD] = shuffle_word; + } + + return output; +} + + + +/** + * Compute a 32b mask of threads having the same least-significant + * LABEL_BITS of \p label as the calling thread. + */ +template +inline __device__ unsigned int MatchAny(unsigned int label) +{ + unsigned int retval; + + // Extract masks of common threads for each bit + #pragma unroll + for (int BIT = 0; BIT < LABEL_BITS; ++BIT) + { + unsigned int mask; + unsigned int current_bit = 1 << BIT; + asm ("{\n" + " .reg .pred p;\n" + " and.b32 %0, %1, %2;" + " setp.eq.u32 p, %0, %2;\n" +#ifdef CUB_USE_COOPERATIVE_GROUPS + " vote.ballot.sync.b32 %0, p, 0xffffffff;\n" +#else + " vote.ballot.b32 %0, p;\n" +#endif + " @!p not.b32 %0, %0;\n" + "}\n" : "=r"(mask) : "r"(label), "r"(current_bit)); + + // Remove peers who differ + retval = (BIT == 0) ? mask : retval & mask; + } + + return retval; + +// // VOLTA match +// unsigned int retval; +// asm ("{\n" +// " match.any.sync.b32 %0, %1, 0xffffffff;\n" +// "}\n" : "=r"(retval) : "r"(label)); +// return retval; + +} + + + + + + + + + + + + + + + + + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/util_type.cuh b/GraphBLAS/CUDA/local_cub/util_type.cuh new file mode 100644 index 0000000000..0ba41e1ed2 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/util_type.cuh @@ -0,0 +1,1167 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * Common type manipulation (metaprogramming) utilities + */ + +#pragma once + +#include +#include +#include + +#if (__CUDACC_VER_MAJOR__ >= 9) + #include +#endif + +#include "util_macro.cuh" +#include "util_arch.cuh" +#include "util_namespace.cuh" + + + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup UtilModule + * @{ + */ + + + +/****************************************************************************** + * Type equality + ******************************************************************************/ + +/** + * \brief Type selection (IF ? ThenType : ElseType) + */ +template +struct If +{ + /// Conditional type result + typedef ThenType Type; // true +}; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +template +struct If +{ + typedef ElseType Type; // false +}; + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + +/****************************************************************************** + * Conditional types + ******************************************************************************/ + +/** + * \brief Type equality test + */ +template +struct Equals +{ + enum { + VALUE = 0, + NEGATE = 1 + }; +}; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +template +struct Equals +{ + enum { + VALUE = 1, + NEGATE = 0 + }; +}; + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/****************************************************************************** + * Static math + ******************************************************************************/ + +/** + * \brief Statically determine log2(N), rounded up. + * + * For example: + * Log2<8>::VALUE // 3 + * Log2<3>::VALUE // 2 + */ +template +struct Log2 +{ + /// Static logarithm value + enum { VALUE = Log2> 1), COUNT + 1>::VALUE }; // Inductive case +}; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +template +struct Log2 +{ + enum {VALUE = (1 << (COUNT - 1) < N) ? // Base case + COUNT : + COUNT - 1 }; +}; + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/** + * \brief Statically determine if N is a power-of-two + */ +template +struct PowerOfTwo +{ + enum { VALUE = ((N & (N - 1)) == 0) }; +}; + + + +/****************************************************************************** + * Pointer vs. iterator detection + ******************************************************************************/ + +/** + * \brief Pointer vs. 
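A few compile-time checks illustrating the static-math and type-equality helpers above; the asserted values follow directly from the definitions (assumes util_type.cuh and util_macro.cuh are included).

CUB_STATIC_ASSERT(cub::Log2<8>::VALUE == 3, "log2(8), rounded up");
CUB_STATIC_ASSERT(cub::Log2<3>::VALUE == 2, "log2(3), rounded up");
CUB_STATIC_ASSERT(cub::PowerOfTwo<64>::VALUE == 1, "64 is a power of two");
CUB_STATIC_ASSERT((cub::Equals<int, int>::VALUE == 1), "same types compare equal");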
iterator + */ +template +struct IsPointer +{ + enum { VALUE = 0 }; +}; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +template +struct IsPointer +{ + enum { VALUE = 1 }; +}; + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + +/****************************************************************************** + * Qualifier detection + ******************************************************************************/ + +/** + * \brief Volatile modifier test + */ +template +struct IsVolatile +{ + enum { VALUE = 0 }; +}; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +template +struct IsVolatile +{ + enum { VALUE = 1 }; +}; + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/****************************************************************************** + * Qualifier removal + ******************************************************************************/ + +/** + * \brief Removes \p const and \p volatile qualifiers from type \p Tp. + * + * For example: + * typename RemoveQualifiers::Type // int; + */ +template +struct RemoveQualifiers +{ + /// Type without \p const and \p volatile qualifiers + typedef Up Type; +}; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +template +struct RemoveQualifiers +{ + typedef Up Type; +}; + +template +struct RemoveQualifiers +{ + typedef Up Type; +}; + +template +struct RemoveQualifiers +{ + typedef Up Type; +}; + + +/****************************************************************************** + * Marker types + ******************************************************************************/ + +/** + * \brief A simple "NULL" marker type + */ +struct NullType +{ +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + template + __host__ __device__ __forceinline__ NullType& operator =(const T&) { return *this; } + + __host__ __device__ __forceinline__ bool operator ==(const NullType&) { return true; } + + __host__ __device__ __forceinline__ bool operator !=(const NullType&) { return false; } + +#endif // DOXYGEN_SHOULD_SKIP_THIS +}; + + +/** + * \brief Allows for the treatment of an integral constant as a type at compile-time (e.g., to achieve static call dispatch based on constant integral values) + */ +template +struct Int2Type +{ + enum {VALUE = A}; +}; + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + +/****************************************************************************** + * Size and alignment + ******************************************************************************/ + +/// Structure alignment +template +struct AlignBytes +{ + struct Pad + { + T val; + char byte; + }; + + enum + { + /// The "true CUDA" alignment of T in bytes + ALIGN_BYTES = sizeof(Pad) - sizeof(T) + }; + + /// The "truly aligned" type + typedef T Type; +}; + +// Specializations where host C++ compilers (e.g., 32-bit Windows) may disagree +// with device C++ compilers (EDG) on types passed as template parameters through +// kernel functions + +#define __CUB_ALIGN_BYTES(t, b) \ + template <> struct AlignBytes \ + { enum { ALIGN_BYTES = b }; typedef __align__(b) t Type; }; + +__CUB_ALIGN_BYTES(short4, 8) +__CUB_ALIGN_BYTES(ushort4, 8) +__CUB_ALIGN_BYTES(int2, 8) +__CUB_ALIGN_BYTES(uint2, 8) +__CUB_ALIGN_BYTES(long long, 8) +__CUB_ALIGN_BYTES(unsigned long long, 8) +__CUB_ALIGN_BYTES(float2, 8) +__CUB_ALIGN_BYTES(double, 8) +#ifdef _WIN32 + __CUB_ALIGN_BYTES(long2, 8) + __CUB_ALIGN_BYTES(ulong2, 8) +#else + __CUB_ALIGN_BYTES(long2, 16) + __CUB_ALIGN_BYTES(ulong2, 16) +#endif +__CUB_ALIGN_BYTES(int4, 16) +__CUB_ALIGN_BYTES(uint4, 16) 
+__CUB_ALIGN_BYTES(float4, 16) +__CUB_ALIGN_BYTES(long4, 16) +__CUB_ALIGN_BYTES(ulong4, 16) +__CUB_ALIGN_BYTES(longlong2, 16) +__CUB_ALIGN_BYTES(ulonglong2, 16) +__CUB_ALIGN_BYTES(double2, 16) +__CUB_ALIGN_BYTES(longlong4, 16) +__CUB_ALIGN_BYTES(ulonglong4, 16) +__CUB_ALIGN_BYTES(double4, 16) + +template struct AlignBytes : AlignBytes {}; +template struct AlignBytes : AlignBytes {}; +template struct AlignBytes : AlignBytes {}; + + +/// Unit-words of data movement +template +struct UnitWord +{ + enum { + ALIGN_BYTES = AlignBytes::ALIGN_BYTES + }; + + template + struct IsMultiple + { + enum { + UNIT_ALIGN_BYTES = AlignBytes::ALIGN_BYTES, + IS_MULTIPLE = (sizeof(T) % sizeof(Unit) == 0) && (ALIGN_BYTES % UNIT_ALIGN_BYTES == 0) + }; + }; + + /// Biggest shuffle word that T is a whole multiple of and is not larger than the alignment of T + typedef typename If::IS_MULTIPLE, + unsigned int, + typename If::IS_MULTIPLE, + unsigned short, + unsigned char>::Type>::Type ShuffleWord; + + /// Biggest volatile word that T is a whole multiple of and is not larger than the alignment of T + typedef typename If::IS_MULTIPLE, + unsigned long long, + ShuffleWord>::Type VolatileWord; + + /// Biggest memory-access word that T is a whole multiple of and is not larger than the alignment of T + typedef typename If::IS_MULTIPLE, + ulonglong2, + VolatileWord>::Type DeviceWord; + + /// Biggest texture reference word that T is a whole multiple of and is not larger than the alignment of T + typedef typename If::IS_MULTIPLE, + uint4, + typename If::IS_MULTIPLE, + uint2, + ShuffleWord>::Type>::Type TextureWord; +}; + + +// float2 specialization workaround (for SM10-SM13) +template <> +struct UnitWord +{ + typedef int ShuffleWord; +#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130) + typedef float VolatileWord; + typedef uint2 DeviceWord; +#else + typedef unsigned long long VolatileWord; + typedef unsigned long long DeviceWord; +#endif + typedef float2 TextureWord; +}; + +// float4 specialization workaround (for SM10-SM13) +template <> +struct UnitWord +{ + typedef int ShuffleWord; +#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130) + typedef float VolatileWord; + typedef uint4 DeviceWord; +#else + typedef unsigned long long VolatileWord; + typedef ulonglong2 DeviceWord; +#endif + typedef float4 TextureWord; +}; + + +// char2 specialization workaround (for SM10-SM13) +template <> +struct UnitWord +{ + typedef unsigned short ShuffleWord; +#if (CUB_PTX_ARCH > 0) && (CUB_PTX_ARCH <= 130) + typedef unsigned short VolatileWord; + typedef short DeviceWord; +#else + typedef unsigned short VolatileWord; + typedef unsigned short DeviceWord; +#endif + typedef unsigned short TextureWord; +}; + + +template struct UnitWord : UnitWord {}; +template struct UnitWord : UnitWord {}; +template struct UnitWord : UnitWord {}; + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + +/****************************************************************************** + * Vector type inference utilities. + ******************************************************************************/ + +/** + * \brief Exposes a member typedef \p Type that names the corresponding CUDA vector type if one exists. Otherwise \p Type refers to the CubVector structure itself, which will wrap the corresponding \p x, \p y, etc. vector fields. 
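A small sketch of the vector-type mapping documented above; the expansions generated further below make cub::CubVector<int, 2> a thin, operator-equipped wrapper over the built-in int2.

__host__ __device__ void cubvector_demo()
{
    cub::CubVector<int, 2> a, b;
    a.x = 1;  a.y = 2;
    b.x = 10; b.y = 20;
    cub::CubVector<int, 2> c = a + b;         // component-wise: {11, 22}
    cub::CubVector<int, 2>::Type native = c;  // ::Type is the built-in int2
    (void) native;
}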
+ */ +template struct CubVector; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + +enum +{ + /// The maximum number of elements in CUDA vector types + MAX_VEC_ELEMENTS = 4, +}; + + +/** + * Generic vector-1 type + */ +template +struct CubVector +{ + T x; + + typedef T BaseType; + typedef CubVector Type; +}; + +/** + * Generic vector-2 type + */ +template +struct CubVector +{ + T x; + T y; + + typedef T BaseType; + typedef CubVector Type; +}; + +/** + * Generic vector-3 type + */ +template +struct CubVector +{ + T x; + T y; + T z; + + typedef T BaseType; + typedef CubVector Type; +}; + +/** + * Generic vector-4 type + */ +template +struct CubVector +{ + T x; + T y; + T z; + T w; + + typedef T BaseType; + typedef CubVector Type; +}; + + +/** + * Macro for expanding partially-specialized built-in vector types + */ +#define CUB_DEFINE_VECTOR_TYPE(base_type,short_type) \ + \ + template<> struct CubVector : short_type##1 \ + { \ + typedef base_type BaseType; \ + typedef short_type##1 Type; \ + __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const { \ + CubVector retval; \ + retval.x = x + other.x; \ + return retval; \ + } \ + __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const { \ + CubVector retval; \ + retval.x = x - other.x; \ + return retval; \ + } \ + }; \ + \ + template<> struct CubVector : short_type##2 \ + { \ + typedef base_type BaseType; \ + typedef short_type##2 Type; \ + __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const { \ + CubVector retval; \ + retval.x = x + other.x; \ + retval.y = y + other.y; \ + return retval; \ + } \ + __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const { \ + CubVector retval; \ + retval.x = x - other.x; \ + retval.y = y - other.y; \ + return retval; \ + } \ + }; \ + \ + template<> struct CubVector : short_type##3 \ + { \ + typedef base_type BaseType; \ + typedef short_type##3 Type; \ + __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const { \ + CubVector retval; \ + retval.x = x + other.x; \ + retval.y = y + other.y; \ + retval.z = z + other.z; \ + return retval; \ + } \ + __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const { \ + CubVector retval; \ + retval.x = x - other.x; \ + retval.y = y - other.y; \ + retval.z = z - other.z; \ + return retval; \ + } \ + }; \ + \ + template<> struct CubVector : short_type##4 \ + { \ + typedef base_type BaseType; \ + typedef short_type##4 Type; \ + __host__ __device__ __forceinline__ CubVector operator+(const CubVector &other) const { \ + CubVector retval; \ + retval.x = x + other.x; \ + retval.y = y + other.y; \ + retval.z = z + other.z; \ + retval.w = w + other.w; \ + return retval; \ + } \ + __host__ __device__ __forceinline__ CubVector operator-(const CubVector &other) const { \ + CubVector retval; \ + retval.x = x - other.x; \ + retval.y = y - other.y; \ + retval.z = z - other.z; \ + retval.w = w - other.w; \ + return retval; \ + } \ + }; + + + +// Expand CUDA vector types for built-in primitives +CUB_DEFINE_VECTOR_TYPE(char, char) +CUB_DEFINE_VECTOR_TYPE(signed char, char) +CUB_DEFINE_VECTOR_TYPE(short, short) +CUB_DEFINE_VECTOR_TYPE(int, int) +CUB_DEFINE_VECTOR_TYPE(long, long) +CUB_DEFINE_VECTOR_TYPE(long long, longlong) +CUB_DEFINE_VECTOR_TYPE(unsigned char, uchar) +CUB_DEFINE_VECTOR_TYPE(unsigned short, ushort) +CUB_DEFINE_VECTOR_TYPE(unsigned int, uint) +CUB_DEFINE_VECTOR_TYPE(unsigned long, ulong) 
+CUB_DEFINE_VECTOR_TYPE(unsigned long long, ulonglong) +CUB_DEFINE_VECTOR_TYPE(float, float) +CUB_DEFINE_VECTOR_TYPE(double, double) +CUB_DEFINE_VECTOR_TYPE(bool, uchar) + +// Undefine macros +#undef CUB_DEFINE_VECTOR_TYPE + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + + +/****************************************************************************** + * Wrapper types + ******************************************************************************/ + +/** + * \brief A storage-backing wrapper that allows types with non-trivial constructors to be aliased in unions + */ +template +struct Uninitialized +{ + /// Biggest memory-access word that T is a whole multiple of and is not larger than the alignment of T + typedef typename UnitWord::DeviceWord DeviceWord; + + enum + { + WORDS = sizeof(T) / sizeof(DeviceWord) + }; + + /// Backing storage + DeviceWord storage[WORDS]; + + /// Alias + __host__ __device__ __forceinline__ T& Alias() + { + return reinterpret_cast(*this); + } +}; + + +/** + * \brief A key identifier paired with a corresponding value + */ +template < + typename _Key, + typename _Value +#if defined(_WIN32) && !defined(_WIN64) + , bool KeyIsLT = (AlignBytes<_Key>::ALIGN_BYTES < AlignBytes<_Value>::ALIGN_BYTES) + , bool ValIsLT = (AlignBytes<_Value>::ALIGN_BYTES < AlignBytes<_Key>::ALIGN_BYTES) +#endif // #if defined(_WIN32) && !defined(_WIN64) + > +struct KeyValuePair +{ + typedef _Key Key; ///< Key data type + typedef _Value Value; ///< Value data type + + Key key; ///< Item key + Value value; ///< Item value + + /// Constructor + __host__ __device__ __forceinline__ + KeyValuePair() {} + + /// Constructor + __host__ __device__ __forceinline__ + KeyValuePair(Key const& key, Value const& value) : key(key), value(value) {} + + /// Inequality operator + __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b) + { + return (value != b.value) || (key != b.key); + } +}; + +#if defined(_WIN32) && !defined(_WIN64) + +/** + * Win32 won't do 16B alignment. 
This can present two problems for + * should-be-16B-aligned (but actually 8B aligned) built-in and intrinsics members: + * 1) If a smaller-aligned item were to be listed first, the host compiler places the + * should-be-16B item at too early an offset (and disagrees with device compiler) + * 2) Or, if a smaller-aligned item lists second, the host compiler gets the size + * of the struct wrong (and disagrees with device compiler) + * + * So we put the larger-should-be-aligned item first, and explicitly pad the + * end of the struct + */ + +/// Smaller key specialization +template +struct KeyValuePair +{ + typedef K Key; + typedef V Value; + + typedef char Pad[AlignBytes::ALIGN_BYTES - AlignBytes::ALIGN_BYTES]; + + Value value; // Value has larger would-be alignment and goes first + Key key; + Pad pad; + + /// Constructor + __host__ __device__ __forceinline__ + KeyValuePair() {} + + /// Constructor + __host__ __device__ __forceinline__ + KeyValuePair(Key const& key, Value const& value) : key(key), value(value) {} + + /// Inequality operator + __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b) + { + return (value != b.value) || (key != b.key); + } +}; + + +/// Smaller value specialization +template +struct KeyValuePair +{ + typedef K Key; + typedef V Value; + + typedef char Pad[AlignBytes::ALIGN_BYTES - AlignBytes::ALIGN_BYTES]; + + Key key; // Key has larger would-be alignment and goes first + Value value; + Pad pad; + + /// Constructor + __host__ __device__ __forceinline__ + KeyValuePair() {} + + /// Constructor + __host__ __device__ __forceinline__ + KeyValuePair(Key const& key, Value const& value) : key(key), value(value) {} + + /// Inequality operator + __host__ __device__ __forceinline__ bool operator !=(const KeyValuePair &b) + { + return (value != b.value) || (key != b.key); + } +}; + +#endif // #if defined(_WIN32) && !defined(_WIN64) + + +#ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + +/** + * \brief A wrapper for passing simple static arrays as kernel parameters + */ +template +struct ArrayWrapper +{ + + /// Statically-sized array of type \p T + T array[COUNT]; + + /// Constructor + __host__ __device__ __forceinline__ ArrayWrapper() {} +}; + +#endif // DOXYGEN_SHOULD_SKIP_THIS + +/** + * \brief Double-buffer storage wrapper for multi-pass stream transformations that require more than one storage array for streaming intermediate results back and forth. + * + * Many multi-pass computations require a pair of "ping-pong" storage + * buffers (e.g., one for reading from and the other for writing to, and then + * vice-versa for the subsequent pass). This structure wraps a set of device + * buffers and a "selector" member to track which is "current". 
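A sketch of the usual ping-pong pattern with DoubleBuffer, shown here with the device-wide radix sort from the full CUB distribution (not part of this local_cub excerpt); d_key_buf, d_key_alt_buf, and num_items are assumed to exist, and error checking is omitted.

// Two device allocations of num_items keys each
cub::DoubleBuffer<int> d_keys(d_key_buf, d_key_alt_buf);

// First call sizes the temporary storage, second call sorts;
// the sorted keys land in d_keys.Current().
void   *d_temp_storage     = NULL;
size_t  temp_storage_bytes = 0;
cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items);
cudaMalloc(&d_temp_storage, temp_storage_bytes);
cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, num_items);
int *d_sorted_keys = d_keys.Current();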
+ */ +template +struct DoubleBuffer +{ + /// Pair of device buffer pointers + T *d_buffers[2]; + + /// Selector into \p d_buffers (i.e., the active/valid buffer) + int selector; + + /// \brief Constructor + __host__ __device__ __forceinline__ DoubleBuffer() + { + selector = 0; + d_buffers[0] = NULL; + d_buffers[1] = NULL; + } + + /// \brief Constructor + __host__ __device__ __forceinline__ DoubleBuffer( + T *d_current, ///< The currently valid buffer + T *d_alternate) ///< Alternate storage buffer of the same size as \p d_current + { + selector = 0; + d_buffers[0] = d_current; + d_buffers[1] = d_alternate; + } + + /// \brief Return pointer to the currently valid buffer + __host__ __device__ __forceinline__ T* Current() { return d_buffers[selector]; } + + /// \brief Return pointer to the currently invalid buffer + __host__ __device__ __forceinline__ T* Alternate() { return d_buffers[selector ^ 1]; } + +}; + + + +/****************************************************************************** + * Typedef-detection + ******************************************************************************/ + + +/** + * \brief Defines a structure \p detector_name that is templated on type \p T. The \p detector_name struct exposes a constant member \p VALUE indicating whether or not parameter \p T exposes a nested type \p nested_type_name + */ +#define CUB_DEFINE_DETECT_NESTED_TYPE(detector_name, nested_type_name) \ + template \ + struct detector_name \ + { \ + template \ + static char& test(typename C::nested_type_name*); \ + template \ + static int& test(...); \ + enum \ + { \ + VALUE = sizeof(test(0)) < sizeof(int) \ + }; \ + }; + + + +/****************************************************************************** + * Simple enable-if (similar to Boost) + ******************************************************************************/ + +/** + * \brief Simple enable-if (similar to Boost) + */ +template +struct EnableIf +{ + /// Enable-if type for SFINAE dummy variables + typedef T Type; +}; + + +template +struct EnableIf {}; + + + +/****************************************************************************** + * Typedef-detection + ******************************************************************************/ + +/** + * \brief Determine whether or not BinaryOp's functor is of the form bool operator()(const T& a, const T&b) or bool operator()(const T& a, const T&b, unsigned int idx) + */ +template +struct BinaryOpHasIdxParam +{ +private: +/* + template struct SFINAE1 {}; + template struct SFINAE2 {}; + template struct SFINAE3 {}; + template struct SFINAE4 {}; +*/ + template struct SFINAE5 {}; + template struct SFINAE6 {}; + template struct SFINAE7 {}; + template struct SFINAE8 {}; +/* + template static char Test(SFINAE1 *); + template static char Test(SFINAE2 *); + template static char Test(SFINAE3 *); + template static char Test(SFINAE4 *); +*/ + template __host__ __device__ static char Test(SFINAE5 *); + template __host__ __device__ static char Test(SFINAE6 *); + template __host__ __device__ static char Test(SFINAE7 *); + template __host__ __device__ static char Test(SFINAE8 *); + + template static int Test(...); + +public: + + /// Whether the functor BinaryOp has a third unsigned int index param + static const bool HAS_PARAM = sizeof(Test(NULL)) == sizeof(char); +}; + + + + +/****************************************************************************** + * Simple type traits utilities. 
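A sketch of the nested-type detector macro above; the detector name has_value_type and the two probe structs are made-up for illustration.

// Generates has_value_type<T>::VALUE, nonzero iff T exposes a nested type named value_type
CUB_DEFINE_DETECT_NESTED_TYPE(has_value_type, value_type)

struct WithNested    { typedef int value_type; };
struct WithoutNested { };

CUB_STATIC_ASSERT(has_value_type<WithNested>::VALUE    == 1, "nested typedef detected");
CUB_STATIC_ASSERT(has_value_type<WithoutNested>::VALUE == 0, "no nested typedef");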
+ * + * For example: + * Traits::CATEGORY // SIGNED_INTEGER + * Traits::NULL_TYPE // true + * Traits::CATEGORY // NOT_A_NUMBER + * Traits::PRIMITIVE; // false + * + ******************************************************************************/ + +/** + * \brief Basic type traits categories + */ +enum Category +{ + NOT_A_NUMBER, + SIGNED_INTEGER, + UNSIGNED_INTEGER, + FLOATING_POINT +}; + + +/** + * \brief Basic type traits + */ +template +struct BaseTraits +{ + /// Category + static const Category CATEGORY = _CATEGORY; + enum + { + PRIMITIVE = _PRIMITIVE, + NULL_TYPE = _NULL_TYPE, + }; +}; + + +/** + * Basic type traits (unsigned primitive specialization) + */ +template +struct BaseTraits +{ + typedef _UnsignedBits UnsignedBits; + + static const Category CATEGORY = UNSIGNED_INTEGER; + static const UnsignedBits LOWEST_KEY = UnsignedBits(0); + static const UnsignedBits MAX_KEY = UnsignedBits(-1); + + enum + { + PRIMITIVE = true, + NULL_TYPE = false, + }; + + + static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key) + { + return key; + } + + static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key) + { + return key; + } + + static __host__ __device__ __forceinline__ T Max() + { + UnsignedBits retval = MAX_KEY; + return reinterpret_cast(retval); + } + + static __host__ __device__ __forceinline__ T Lowest() + { + UnsignedBits retval = LOWEST_KEY; + return reinterpret_cast(retval); + } +}; + + +/** + * Basic type traits (signed primitive specialization) + */ +template +struct BaseTraits +{ + typedef _UnsignedBits UnsignedBits; + + static const Category CATEGORY = SIGNED_INTEGER; + static const UnsignedBits HIGH_BIT = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1); + static const UnsignedBits LOWEST_KEY = HIGH_BIT; + static const UnsignedBits MAX_KEY = UnsignedBits(-1) ^ HIGH_BIT; + + enum + { + PRIMITIVE = true, + NULL_TYPE = false, + }; + + static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key) + { + return key ^ HIGH_BIT; + }; + + static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key) + { + return key ^ HIGH_BIT; + }; + + static __host__ __device__ __forceinline__ T Max() + { + UnsignedBits retval = MAX_KEY; + return reinterpret_cast(retval); + } + + static __host__ __device__ __forceinline__ T Lowest() + { + UnsignedBits retval = LOWEST_KEY; + return reinterpret_cast(retval); + } +}; + +template +struct FpLimits; + +template <> +struct FpLimits +{ + static __host__ __device__ __forceinline__ float Max() { + return FLT_MAX; + } + + static __host__ __device__ __forceinline__ float Lowest() { + return FLT_MAX * float(-1); + } +}; + +template <> +struct FpLimits +{ + static __host__ __device__ __forceinline__ double Max() { + return DBL_MAX; + } + + static __host__ __device__ __forceinline__ double Lowest() { + return DBL_MAX * double(-1); + } +}; + + +#if (__CUDACC_VER_MAJOR__ >= 9) +template <> +struct FpLimits<__half> +{ + static __host__ __device__ __forceinline__ __half Max() { + unsigned short max_word = 0x7BFF; + return reinterpret_cast<__half&>(max_word); + } + + static __host__ __device__ __forceinline__ __half Lowest() { + unsigned short lowest_word = 0xFBFF; + return reinterpret_cast<__half&>(lowest_word); + } +}; +#endif + + +/** + * Basic type traits (fp primitive specialization) + */ +template +struct BaseTraits +{ + typedef _UnsignedBits UnsignedBits; + + static const Category CATEGORY = FLOATING_POINT; + static const UnsignedBits HIGH_BIT = UnsignedBits(1) << ((sizeof(UnsignedBits) * 8) - 1); + 
static const UnsignedBits LOWEST_KEY = UnsignedBits(-1); + static const UnsignedBits MAX_KEY = UnsignedBits(-1) ^ HIGH_BIT; + + enum + { + PRIMITIVE = true, + NULL_TYPE = false, + }; + + static __device__ __forceinline__ UnsignedBits TwiddleIn(UnsignedBits key) + { + UnsignedBits mask = (key & HIGH_BIT) ? UnsignedBits(-1) : HIGH_BIT; + return key ^ mask; + }; + + static __device__ __forceinline__ UnsignedBits TwiddleOut(UnsignedBits key) + { + UnsignedBits mask = (key & HIGH_BIT) ? HIGH_BIT : UnsignedBits(-1); + return key ^ mask; + }; + + static __host__ __device__ __forceinline__ T Max() { + return FpLimits::Max(); + } + + static __host__ __device__ __forceinline__ T Lowest() { + return FpLimits::Lowest(); + } +}; + + +/** + * \brief Numeric type traits + */ +template struct NumericTraits : BaseTraits {}; + +template <> struct NumericTraits : BaseTraits {}; + +template <> struct NumericTraits : BaseTraits<(std::numeric_limits::is_signed) ? SIGNED_INTEGER : UNSIGNED_INTEGER, true, false, unsigned char, char> {}; +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; + +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; + +template <> struct NumericTraits : BaseTraits {}; +template <> struct NumericTraits : BaseTraits {}; +#if (__CUDACC_VER_MAJOR__ >= 9) + template <> struct NumericTraits<__half> : BaseTraits {}; +#endif + +template <> struct NumericTraits : BaseTraits::VolatileWord, bool> {}; + + + +/** + * \brief Type traits + */ +template +struct Traits : NumericTraits::Type> {}; + + +#endif // DOXYGEN_SHOULD_SKIP_THIS + + +/** @} */ // end group UtilModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/warp/specializations/warp_reduce_shfl.cuh b/GraphBLAS/CUDA/local_cub/warp/specializations/warp_reduce_shfl.cuh new file mode 100644 index 0000000000..bbbf37e5c7 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/warp/specializations/warp_reduce_shfl.cuh @@ -0,0 +1,541 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp. + */ + +#pragma once + +#include "../../thread/thread_operators.cuh" +#include "../../util_ptx.cuh" +#include "../../util_type.cuh" +#include "../../util_macro.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \brief WarpReduceShfl provides SHFL-based variants of parallel reduction of items partitioned across a CUDA thread warp. + * + * LOGICAL_WARP_THREADS must be a power-of-two + */ +template < + typename T, ///< Data type being reduced + int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct WarpReduceShfl +{ + //--------------------------------------------------------------------- + // Constants and type definitions + //--------------------------------------------------------------------- + + enum + { + /// Whether the logical warp size and the PTX warp size coincide + IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), + + /// The number of warp reduction steps + STEPS = Log2::VALUE, + + /// Number of logical warps in a PTX warp + LOGICAL_WARPS = CUB_WARP_THREADS(PTX_ARCH) / LOGICAL_WARP_THREADS, + + /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up + SHFL_C = (CUB_WARP_THREADS(PTX_ARCH) - LOGICAL_WARP_THREADS) << 8 + + }; + + template + struct IsInteger + { + enum { + ///Whether the data type is a small (32b or less) integer for which we can use a single SFHL instruction per exchange + IS_SMALL_UNSIGNED = (Traits::CATEGORY == UNSIGNED_INTEGER) && (sizeof(S) <= sizeof(unsigned int)) + }; + }; + + + /// Shared memory storage layout type + typedef NullType TempStorage; + + + //--------------------------------------------------------------------- + // Thread fields + //--------------------------------------------------------------------- + + /// Lane index in logical warp + unsigned int lane_id; + + /// Logical warp index in 32-thread physical warp + unsigned int warp_id; + + /// 32-thread physical warp member mask of logical warp + unsigned int member_mask; + + + //--------------------------------------------------------------------- + // Construction + //--------------------------------------------------------------------- + + /// Constructor + __device__ __forceinline__ WarpReduceShfl( + TempStorage &/*temp_storage*/) + { + lane_id = LaneId(); + warp_id = 0; + member_mask = 0xffffffffu >> (CUB_WARP_THREADS(PTX_ARCH) - LOGICAL_WARP_THREADS); + + if (!IS_ARCH_WARP) + { + warp_id = lane_id / LOGICAL_WARP_THREADS; + lane_id = lane_id % LOGICAL_WARP_THREADS; + member_mask = member_mask << (warp_id * LOGICAL_WARP_THREADS); + } + } + + + 
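// Worked example for the constructor above (illustrative note, not part of the
// original CUB header): with LOGICAL_WARP_THREADS == 16 on a 32-thread physical
// warp, IS_ARCH_WARP is false, so physical lane 21 computes
//     warp_id     = 21 / 16 = 1
//     lane_id     = 21 % 16 = 5
//     member_mask = (0xffffffff >> 16) << (1 * 16) = 0xffff0000
// i.e. only the upper half-warp participates in this logical warp's shuffles.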
//--------------------------------------------------------------------- + // Reduction steps + //--------------------------------------------------------------------- + + /// Reduction (specialized for summation across uint32 types) + __device__ __forceinline__ unsigned int ReduceStep( + unsigned int input, ///< [in] Calling thread's input item. + cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset) ///< [in] Up-offset to pull from + { + unsigned int output; + int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .u32 r0;" + " .reg .pred p;" + " shfl.sync.down.b32 r0|p, %1, %2, %3, %5;" + " @p add.u32 r0, r0, %4;" + " mov.u32 %0, r0;" + "}" + : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .u32 r0;" + " .reg .pred p;" + " shfl.down.b32 r0|p, %1, %2, %3;" + " @p add.u32 r0, r0, %4;" + " mov.u32 %0, r0;" + "}" + : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input)); +#endif + + return output; + } + + + /// Reduction (specialized for summation across fp32 types) + __device__ __forceinline__ float ReduceStep( + float input, ///< [in] Calling thread's input item. + cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset) ///< [in] Up-offset to pull from + { + float output; + int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .f32 r0;" + " .reg .pred p;" + " shfl.sync.down.b32 r0|p, %1, %2, %3, %5;" + " @p add.f32 r0, r0, %4;" + " mov.f32 %0, r0;" + "}" + : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .f32 r0;" + " .reg .pred p;" + " shfl.down.b32 r0|p, %1, %2, %3;" + " @p add.f32 r0, r0, %4;" + " mov.f32 %0, r0;" + "}" + : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input)); +#endif + + return output; + } + + + /// Reduction (specialized for summation across unsigned long long types) + __device__ __forceinline__ unsigned long long ReduceStep( + unsigned long long input, ///< [in] Calling thread's input item. 
+ cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset) ///< [in] Up-offset to pull from + { + unsigned long long output; + int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane) + +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.sync.down.b32 lo|p, lo, %2, %3, %4;" + " shfl.sync.down.b32 hi|p, hi, %2, %3, %4;" + " mov.b64 %0, {lo, hi};" + " @p add.u64 %0, %0, %1;" + "}" + : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.down.b32 lo|p, lo, %2, %3;" + " shfl.down.b32 hi|p, hi, %2, %3;" + " mov.b64 %0, {lo, hi};" + " @p add.u64 %0, %0, %1;" + "}" + : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c)); +#endif + + return output; + } + + + /// Reduction (specialized for summation across long long types) + __device__ __forceinline__ long long ReduceStep( + long long input, ///< [in] Calling thread's input item. + cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset) ///< [in] Up-offset to pull from + { + long long output; + int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.sync.down.b32 lo|p, lo, %2, %3, %4;" + " shfl.sync.down.b32 hi|p, hi, %2, %3, %4;" + " mov.b64 %0, {lo, hi};" + " @p add.s64 %0, %0, %1;" + "}" + : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.down.b32 lo|p, lo, %2, %3;" + " shfl.down.b32 hi|p, hi, %2, %3;" + " mov.b64 %0, {lo, hi};" + " @p add.s64 %0, %0, %1;" + "}" + : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c)); +#endif + + return output; + } + + + /// Reduction (specialized for summation across double types) + __device__ __forceinline__ double ReduceStep( + double input, ///< [in] Calling thread's input item. 
+ cub::Sum /*reduction_op*/, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset) ///< [in] Up-offset to pull from + { + double output; + int shfl_c = last_lane | SHFL_C; // Shuffle control (mask and last_lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " .reg .f64 r0;" + " mov.b64 %0, %1;" + " mov.b64 {lo, hi}, %1;" + " shfl.sync.down.b32 lo|p, lo, %2, %3, %4;" + " shfl.sync.down.b32 hi|p, hi, %2, %3, %4;" + " mov.b64 r0, {lo, hi};" + " @p add.f64 %0, %0, r0;" + "}" + : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " .reg .f64 r0;" + " mov.b64 %0, %1;" + " mov.b64 {lo, hi}, %1;" + " shfl.down.b32 lo|p, lo, %2, %3;" + " shfl.down.b32 hi|p, hi, %2, %3;" + " mov.b64 r0, {lo, hi};" + " @p add.f64 %0, %0, r0;" + "}" + : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c)); +#endif + + return output; + } + + + /// Reduction (specialized for swizzled ReduceByKeyOp across KeyValuePair types) + template + __device__ __forceinline__ KeyValuePair ReduceStep( + KeyValuePair input, ///< [in] Calling thread's input item. + SwizzleScanOp > /*reduction_op*/, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset) ///< [in] Up-offset to pull from + { + KeyValuePair output; + + KeyT other_key = ShuffleDown(input.key, offset, last_lane, member_mask); + + output.key = input.key; + output.value = ReduceStep( + input.value, + cub::Sum(), + last_lane, + offset, + Int2Type::IS_SMALL_UNSIGNED>()); + + if (input.key != other_key) + output.value = input.value; + + return output; + } + + + + /// Reduction (specialized for swizzled ReduceBySegmentOp across KeyValuePair types) + template + __device__ __forceinline__ KeyValuePair ReduceStep( + KeyValuePair input, ///< [in] Calling thread's input item. + SwizzleScanOp > /*reduction_op*/, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset) ///< [in] Up-offset to pull from + { + KeyValuePair output; + + output.value = ReduceStep(input.value, cub::Sum(), last_lane, offset, Int2Type::IS_SMALL_UNSIGNED>()); + output.key = ReduceStep(input.key, cub::Sum(), last_lane, offset, Int2Type::IS_SMALL_UNSIGNED>()); + + if (input.key > 0) + output.value = input.value; + + return output; + } + + + /// Reduction step (generic) + template + __device__ __forceinline__ _T ReduceStep( + _T input, ///< [in] Calling thread's input item. + ReductionOp reduction_op, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset) ///< [in] Up-offset to pull from + { + _T output = input; + + _T temp = ShuffleDown(output, offset, last_lane, member_mask); + + // Perform reduction op if valid + if (offset + lane_id <= last_lane) + output = reduction_op(input, temp); + + return output; + } + + + /// Reduction step (specialized for small unsigned integers size 32b or less) + template + __device__ __forceinline__ _T ReduceStep( + _T input, ///< [in] Calling thread's input item. 
+ ReductionOp reduction_op, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset, ///< [in] Up-offset to pull from + Int2Type /*is_small_unsigned*/) ///< [in] Marker type indicating whether T is a small unsigned integer + { + return ReduceStep(input, reduction_op, last_lane, offset); + } + + + /// Reduction step (specialized for types other than small unsigned integers size 32b or less) + template + __device__ __forceinline__ _T ReduceStep( + _T input, ///< [in] Calling thread's input item. + ReductionOp reduction_op, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + int offset, ///< [in] Up-offset to pull from + Int2Type /*is_small_unsigned*/) ///< [in] Marker type indicating whether T is a small unsigned integer + { + return ReduceStep(input, reduction_op, last_lane, offset); + } + + + //--------------------------------------------------------------------- + // Templated inclusive scan iteration + //--------------------------------------------------------------------- + + template + __device__ __forceinline__ void ReduceStep( + T& input, ///< [in] Calling thread's input item. + ReductionOp reduction_op, ///< [in] Binary reduction operator + int last_lane, ///< [in] Index of last lane in segment + Int2Type /*step*/) + { + input = ReduceStep(input, reduction_op, last_lane, 1 << STEP, Int2Type::IS_SMALL_UNSIGNED>()); + + ReduceStep(input, reduction_op, last_lane, Int2Type()); + } + + template + __device__ __forceinline__ void ReduceStep( + T& /*input*/, ///< [in] Calling thread's input item. + ReductionOp /*reduction_op*/, ///< [in] Binary reduction operator + int /*last_lane*/, ///< [in] Index of last lane in segment + Int2Type /*step*/) + {} + + + //--------------------------------------------------------------------- + // Reduction operations + //--------------------------------------------------------------------- + + /// Reduction + template < + bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items + typename ReductionOp> + __device__ __forceinline__ T Reduce( + T input, ///< [in] Calling thread's input + int valid_items, ///< [in] Total number of valid items across the logical warp + ReductionOp reduction_op) ///< [in] Binary reduction operator + { + int last_lane = (ALL_LANES_VALID) ? + LOGICAL_WARP_THREADS - 1 : + valid_items - 1; + + T output = input; + +// // Iterate reduction steps +// #pragma unroll +// for (int STEP = 0; STEP < STEPS; STEP++) +// { +// output = ReduceStep(output, reduction_op, last_lane, 1 << STEP, Int2Type::IS_SMALL_UNSIGNED>()); +// } + + // Template-iterate reduction steps + ReduceStep(output, reduction_op, last_lane, Int2Type<0>()); + + return output; + } + + + /// Segmented reduction + template < + bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail + typename FlagT, + typename ReductionOp> + __device__ __forceinline__ T SegmentedReduce( + T input, ///< [in] Calling thread's input + FlagT flag, ///< [in] Whether or not the current lane is a segment head/tail + ReductionOp reduction_op) ///< [in] Binary reduction operator + { + // Get the start flags for each thread in the warp. 
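        // [Editorial note — illustrative, not part of the vendored CUB header.]
        // The ballot below sets one bit per lane where `flag` is true.  Head flags
        // are then shifted down by one, so a head at lane i marks lane i-1 as the
        // end of the preceding segment; bits below the calling lane are masked off;
        // the top logical lane is always forced on; and __clz(__brev(x)) returns the
        // index of the lowest set bit, i.e. the last lane of the calling thread's
        // segment.  Worked example (32-thread warp, HEAD_SEGMENTED, head flags at
        // lanes 0, 5 and 9, calling lane 2):
        //     ballot          = ...0000_0010_0010_0001   (bits 0, 5, 9)
        //     >> 1            = ...0000_0001_0001_0000   (bits 4, 8)
        //     & LaneMaskGe()  = bits 4 and 8              (unchanged for lane 2)
        //     | top-lane bit  = bit 31 also set
        //     last_lane       = __clz(__brev(x)) = 4  ->  lanes 2..4 reduce together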
+ int warp_flags = WARP_BALLOT(flag, member_mask); + + // Convert to tail-segmented + if (HEAD_SEGMENTED) + warp_flags >>= 1; + + // Mask out the bits below the current thread + warp_flags &= LaneMaskGe(); + + // Mask of physical lanes outside the logical warp and convert to logical lanemask + if (!IS_ARCH_WARP) + { + warp_flags = (warp_flags & member_mask) >> (warp_id * LOGICAL_WARP_THREADS); + } + + // Mask in the last lane of logical warp + warp_flags |= 1u << (LOGICAL_WARP_THREADS - 1); + + // Find the next set flag + int last_lane = __clz(__brev(warp_flags)); + + T output = input; + +// // Iterate reduction steps +// #pragma unroll +// for (int STEP = 0; STEP < STEPS; STEP++) +// { +// output = ReduceStep(output, reduction_op, last_lane, 1 << STEP, Int2Type::IS_SMALL_UNSIGNED>()); +// } + + // Template-iterate reduction steps + ReduceStep(output, reduction_op, last_lane, Int2Type<0>()); + + return output; + } +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/warp/specializations/warp_reduce_smem.cuh b/GraphBLAS/CUDA/local_cub/warp/specializations/warp_reduce_smem.cuh new file mode 100644 index 0000000000..7baa573be1 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/warp/specializations/warp_reduce_smem.cuh @@ -0,0 +1,372 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned across a CUDA thread warp. 
+ */ + +#pragma once + +#include "../../thread/thread_operators.cuh" +#include "../../thread/thread_load.cuh" +#include "../../thread/thread_store.cuh" +#include "../../util_type.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief WarpReduceSmem provides smem-based variants of parallel reduction of items partitioned across a CUDA thread warp. + */ +template < + typename T, ///< Data type being reduced + int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct WarpReduceSmem +{ + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + enum + { + /// Whether the logical warp size and the PTX warp size coincide + IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), + + /// Whether the logical warp size is a power-of-two + IS_POW_OF_TWO = PowerOfTwo::VALUE, + + /// The number of warp scan steps + STEPS = Log2::VALUE, + + /// The number of threads in half a warp + HALF_WARP_THREADS = 1 << (STEPS - 1), + + /// The number of shared memory elements per warp + WARP_SMEM_ELEMENTS = LOGICAL_WARP_THREADS + HALF_WARP_THREADS, + + /// FlagT status (when not using ballot) + UNSET = 0x0, // Is initially unset + SET = 0x1, // Is initially set + SEEN = 0x2, // Has seen another head flag from a successor peer + }; + + /// Shared memory flag type + typedef unsigned char SmemFlag; + + /// Shared memory storage layout type (1.5 warps-worth of elements for each warp) + struct _TempStorage + { + T reduce[WARP_SMEM_ELEMENTS]; + SmemFlag flags[WARP_SMEM_ELEMENTS]; + }; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + _TempStorage &temp_storage; + unsigned int lane_id; + unsigned int member_mask; + + + /****************************************************************************** + * Construction + ******************************************************************************/ + + /// Constructor + __device__ __forceinline__ WarpReduceSmem( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + + lane_id(IS_ARCH_WARP ? + LaneId() : + LaneId() % LOGICAL_WARP_THREADS), + + member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP || !IS_POW_OF_TWO ) ? 
+ 0 : // arch-width and non-power-of-two subwarps cannot be tiled with the arch-warp + ((LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS))) + {} + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + //--------------------------------------------------------------------- + // Regular reduction + //--------------------------------------------------------------------- + + /** + * Reduction step + */ + template < + bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items + typename ReductionOp, + int STEP> + __device__ __forceinline__ T ReduceStep( + T input, ///< [in] Calling thread's input + int valid_items, ///< [in] Total number of valid items across the logical warp + ReductionOp reduction_op, ///< [in] Reduction operator + Int2Type /*step*/) + { + const int OFFSET = 1 << STEP; + + // Share input through buffer + ThreadStore(&temp_storage.reduce[lane_id], input); + + WARP_SYNC(member_mask); + + // Update input if peer_addend is in range + if ((ALL_LANES_VALID && IS_POW_OF_TWO) || ((lane_id + OFFSET) < valid_items)) + { + T peer_addend = ThreadLoad(&temp_storage.reduce[lane_id + OFFSET]); + input = reduction_op(input, peer_addend); + } + + WARP_SYNC(member_mask); + + return ReduceStep(input, valid_items, reduction_op, Int2Type()); + } + + + /** + * Reduction step (terminate) + */ + template < + bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items + typename ReductionOp> + __device__ __forceinline__ T ReduceStep( + T input, ///< [in] Calling thread's input + int valid_items, ///< [in] Total number of valid items across the logical warp + ReductionOp /*reduction_op*/, ///< [in] Reduction operator + Int2Type /*step*/) + { + return input; + } + + + //--------------------------------------------------------------------- + // Segmented reduction + //--------------------------------------------------------------------- + + + /** + * Ballot-based segmented reduce + */ + template < + bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail + typename FlagT, + typename ReductionOp> + __device__ __forceinline__ T SegmentedReduce( + T input, ///< [in] Calling thread's input + FlagT flag, ///< [in] Whether or not the current lane is a segment head/tail + ReductionOp reduction_op, ///< [in] Reduction operator + Int2Type /*has_ballot*/) ///< [in] Marker type for whether the target arch has ballot functionality + { + // Get the start flags for each thread in the warp. + int warp_flags = WARP_BALLOT(flag, member_mask); + + if (!HEAD_SEGMENTED) + warp_flags <<= 1; + + // Keep bits above the current thread. 
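        // [Editorial note — illustrative, not part of the vendored CUB header.]
        // Unlike the SHFL variant, this path keeps head flags as-is: tail flags are
        // first shifted left by one so a tail at lane i becomes a mark at lane i+1,
        // the first lane of the next segment.  After masking to lanes strictly above
        // the calling lane, __clz(__brev(x)) yields next_flag, the first lane of the
        // *next* segment, and the loop below only folds in peers with
        // lane_id + OFFSET < next_flag.  Worked example (head flags at lanes 0, 5
        // and 9, calling lane 2): the surviving bits are 5 and 9, so next_flag = 5
        // and lanes 2..4 are reduced together, matching the SHFL-based result above.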
+ warp_flags &= LaneMaskGt(); + + // Accommodate packing of multiple logical warps in a single physical warp + if (!IS_ARCH_WARP) + { + warp_flags >>= (LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS; + } + + // Find next flag + int next_flag = __clz(__brev(warp_flags)); + + // Clip the next segment at the warp boundary if necessary + if (LOGICAL_WARP_THREADS != 32) + next_flag = CUB_MIN(next_flag, LOGICAL_WARP_THREADS); + + #pragma unroll + for (int STEP = 0; STEP < STEPS; STEP++) + { + const int OFFSET = 1 << STEP; + + // Share input into buffer + ThreadStore(&temp_storage.reduce[lane_id], input); + + WARP_SYNC(member_mask); + + // Update input if peer_addend is in range + if (OFFSET + lane_id < next_flag) + { + T peer_addend = ThreadLoad(&temp_storage.reduce[lane_id + OFFSET]); + input = reduction_op(input, peer_addend); + } + + WARP_SYNC(member_mask); + } + + return input; + } + + + /** + * Smem-based segmented reduce + */ + template < + bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail + typename FlagT, + typename ReductionOp> + __device__ __forceinline__ T SegmentedReduce( + T input, ///< [in] Calling thread's input + FlagT flag, ///< [in] Whether or not the current lane is a segment head/tail + ReductionOp reduction_op, ///< [in] Reduction operator + Int2Type /*has_ballot*/) ///< [in] Marker type for whether the target arch has ballot functionality + { + enum + { + UNSET = 0x0, // Is initially unset + SET = 0x1, // Is initially set + SEEN = 0x2, // Has seen another head flag from a successor peer + }; + + // Alias flags onto shared data storage + volatile SmemFlag *flag_storage = temp_storage.flags; + + SmemFlag flag_status = (flag) ? SET : UNSET; + + for (int STEP = 0; STEP < STEPS; STEP++) + { + const int OFFSET = 1 << STEP; + + // Share input through buffer + ThreadStore(&temp_storage.reduce[lane_id], input); + + WARP_SYNC(member_mask); + + // Get peer from buffer + T peer_addend = ThreadLoad(&temp_storage.reduce[lane_id + OFFSET]); + + WARP_SYNC(member_mask); + + // Share flag through buffer + flag_storage[lane_id] = flag_status; + + // Get peer flag from buffer + SmemFlag peer_flag_status = flag_storage[lane_id + OFFSET]; + + // Update input if peer was in range + if (lane_id < LOGICAL_WARP_THREADS - OFFSET) + { + if (HEAD_SEGMENTED) + { + // Head-segmented + if ((flag_status & SEEN) == 0) + { + // Has not seen a more distant head flag + if (peer_flag_status & SET) + { + // Has now seen a head flag + flag_status |= SEEN; + } + else + { + // Peer is not a head flag: grab its count + input = reduction_op(input, peer_addend); + } + + // Update seen status to include that of peer + flag_status |= (peer_flag_status & SEEN); + } + } + else + { + // Tail-segmented. 
Simply propagate flag status + if (!flag_status) + { + input = reduction_op(input, peer_addend); + flag_status |= peer_flag_status; + } + + } + } + } + + return input; + } + + + /****************************************************************************** + * Interface + ******************************************************************************/ + + /** + * Reduction + */ + template < + bool ALL_LANES_VALID, ///< Whether all lanes in each warp are contributing a valid fold of items + typename ReductionOp> + __device__ __forceinline__ T Reduce( + T input, ///< [in] Calling thread's input + int valid_items, ///< [in] Total number of valid items across the logical warp + ReductionOp reduction_op) ///< [in] Reduction operator + { + return ReduceStep(input, valid_items, reduction_op, Int2Type<0>()); + } + + + /** + * Segmented reduction + */ + template < + bool HEAD_SEGMENTED, ///< Whether flags indicate a segment-head or a segment-tail + typename FlagT, + typename ReductionOp> + __device__ __forceinline__ T SegmentedReduce( + T input, ///< [in] Calling thread's input + FlagT flag, ///< [in] Whether or not the current lane is a segment head/tail + ReductionOp reduction_op) ///< [in] Reduction operator + { + return SegmentedReduce(input, flag, reduction_op, Int2Type<(PTX_ARCH >= 200)>()); + } + + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/warp/specializations/warp_scan_shfl.cuh b/GraphBLAS/CUDA/local_cub/warp/specializations/warp_scan_shfl.cuh new file mode 100644 index 0000000000..7f4e1c94bb --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/warp/specializations/warp_scan_shfl.cuh @@ -0,0 +1,632 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +/** + * \file + * cub::WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned across a CUDA thread warp. + */ + +#pragma once + +#include "../../thread/thread_operators.cuh" +#include "../../util_type.cuh" +#include "../../util_ptx.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned across a CUDA thread warp. + * + * LOGICAL_WARP_THREADS must be a power-of-two + */ +template < + typename T, ///< Data type being scanned + int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct WarpScanShfl +{ + //--------------------------------------------------------------------- + // Constants and type definitions + //--------------------------------------------------------------------- + + enum + { + /// Whether the logical warp size and the PTX warp size coincide + IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), + + /// The number of warp scan steps + STEPS = Log2::VALUE, + + /// The 5-bit SHFL mask for logically splitting warps into sub-segments starts 8-bits up + SHFL_C = (CUB_WARP_THREADS(PTX_ARCH) - LOGICAL_WARP_THREADS) << 8 + }; + + template + struct IntegerTraits + { + enum { + ///Whether the data type is a small (32b or less) integer for which we can use a single SFHL instruction per exchange + IS_SMALL_UNSIGNED = (Traits::CATEGORY == UNSIGNED_INTEGER) && (sizeof(S) <= sizeof(unsigned int)) + }; + }; + + /// Shared memory storage layout type + struct TempStorage {}; + + + //--------------------------------------------------------------------- + // Thread fields + //--------------------------------------------------------------------- + + /// Lane index in logical warp + unsigned int lane_id; + + /// Logical warp index in 32-thread physical warp + unsigned int warp_id; + + /// 32-thread physical warp member mask of logical warp + unsigned int member_mask; + + //--------------------------------------------------------------------- + // Construction + //--------------------------------------------------------------------- + + /// Constructor + __device__ __forceinline__ WarpScanShfl( + TempStorage &/*temp_storage*/) + { + lane_id = LaneId(); + warp_id = 0; + member_mask = 0xffffffffu >> (CUB_WARP_THREADS(PTX_ARCH) - LOGICAL_WARP_THREADS); + + if (!IS_ARCH_WARP) + { + warp_id = lane_id / LOGICAL_WARP_THREADS; + lane_id = lane_id % LOGICAL_WARP_THREADS; + member_mask = member_mask << (warp_id * LOGICAL_WARP_THREADS); + } + } + + + //--------------------------------------------------------------------- + // Inclusive scan steps + //--------------------------------------------------------------------- + + /// Inclusive prefix scan step (specialized for summation across int32 types) + __device__ __forceinline__ int InclusiveScanStep( + int input, ///< [in] Calling thread's input item. 
+ cub::Sum /*scan_op*/, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset) ///< [in] Up-offset to pull from + { + int output; + int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .s32 r0;" + " .reg .pred p;" + " shfl.sync.up.b32 r0|p, %1, %2, %3, %5;" + " @p add.s32 r0, r0, %4;" + " mov.s32 %0, r0;" + "}" + : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .s32 r0;" + " .reg .pred p;" + " shfl.up.b32 r0|p, %1, %2, %3;" + " @p add.s32 r0, r0, %4;" + " mov.s32 %0, r0;" + "}" + : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input)); +#endif + + return output; + } + + /// Inclusive prefix scan step (specialized for summation across uint32 types) + __device__ __forceinline__ unsigned int InclusiveScanStep( + unsigned int input, ///< [in] Calling thread's input item. + cub::Sum /*scan_op*/, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset) ///< [in] Up-offset to pull from + { + unsigned int output; + int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .u32 r0;" + " .reg .pred p;" + " shfl.sync.up.b32 r0|p, %1, %2, %3, %5;" + " @p add.u32 r0, r0, %4;" + " mov.u32 %0, r0;" + "}" + : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .u32 r0;" + " .reg .pred p;" + " shfl.up.b32 r0|p, %1, %2, %3;" + " @p add.u32 r0, r0, %4;" + " mov.u32 %0, r0;" + "}" + : "=r"(output) : "r"(input), "r"(offset), "r"(shfl_c), "r"(input)); +#endif + + return output; + } + + + /// Inclusive prefix scan step (specialized for summation across fp32 types) + __device__ __forceinline__ float InclusiveScanStep( + float input, ///< [in] Calling thread's input item. + cub::Sum /*scan_op*/, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset) ///< [in] Up-offset to pull from + { + float output; + int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .f32 r0;" + " .reg .pred p;" + " shfl.sync.up.b32 r0|p, %1, %2, %3, %5;" + " @p add.f32 r0, r0, %4;" + " mov.f32 %0, r0;" + "}" + : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .f32 r0;" + " .reg .pred p;" + " shfl.up.b32 r0|p, %1, %2, %3;" + " @p add.f32 r0, r0, %4;" + " mov.f32 %0, r0;" + "}" + : "=f"(output) : "f"(input), "r"(offset), "r"(shfl_c), "f"(input)); +#endif + + return output; + } + + + /// Inclusive prefix scan step (specialized for summation across unsigned long long types) + __device__ __forceinline__ unsigned long long InclusiveScanStep( + unsigned long long input, ///< [in] Calling thread's input item. 
+ cub::Sum /*scan_op*/, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset) ///< [in] Up-offset to pull from + { + unsigned long long output; + int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .u64 r0;" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.sync.up.b32 lo|p, lo, %2, %3, %5;" + " shfl.sync.up.b32 hi|p, hi, %2, %3, %5;" + " mov.b64 r0, {lo, hi};" + " @p add.u64 r0, r0, %4;" + " mov.u64 %0, r0;" + "}" + : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .u64 r0;" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.up.b32 lo|p, lo, %2, %3;" + " shfl.up.b32 hi|p, hi, %2, %3;" + " mov.b64 r0, {lo, hi};" + " @p add.u64 r0, r0, %4;" + " mov.u64 %0, r0;" + "}" + : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input)); +#endif + + return output; + } + + + /// Inclusive prefix scan step (specialized for summation across long long types) + __device__ __forceinline__ long long InclusiveScanStep( + long long input, ///< [in] Calling thread's input item. + cub::Sum /*scan_op*/, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset) ///< [in] Up-offset to pull from + { + long long output; + int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .s64 r0;" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.sync.up.b32 lo|p, lo, %2, %3, %5;" + " shfl.sync.up.b32 hi|p, hi, %2, %3, %5;" + " mov.b64 r0, {lo, hi};" + " @p add.s64 r0, r0, %4;" + " mov.s64 %0, r0;" + "}" + : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .s64 r0;" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " mov.b64 {lo, hi}, %1;" + " shfl.up.b32 lo|p, lo, %2, %3;" + " shfl.up.b32 hi|p, hi, %2, %3;" + " mov.b64 r0, {lo, hi};" + " @p add.s64 r0, r0, %4;" + " mov.s64 %0, r0;" + "}" + : "=l"(output) : "l"(input), "r"(offset), "r"(shfl_c), "l"(input)); +#endif + + return output; + } + + + /// Inclusive prefix scan step (specialized for summation across fp64 types) + __device__ __forceinline__ double InclusiveScanStep( + double input, ///< [in] Calling thread's input item. 
+ cub::Sum /*scan_op*/, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset) ///< [in] Up-offset to pull from + { + double output; + int shfl_c = first_lane | SHFL_C; // Shuffle control (mask and first-lane) + + // Use predicate set from SHFL to guard against invalid peers +#ifdef CUB_USE_COOPERATIVE_GROUPS + asm volatile( + "{" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " .reg .f64 r0;" + " mov.b64 %0, %1;" + " mov.b64 {lo, hi}, %1;" + " shfl.sync.up.b32 lo|p, lo, %2, %3, %4;" + " shfl.sync.up.b32 hi|p, hi, %2, %3, %4;" + " mov.b64 r0, {lo, hi};" + " @p add.f64 %0, %0, r0;" + "}" + : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c), "r"(member_mask)); +#else + asm volatile( + "{" + " .reg .u32 lo;" + " .reg .u32 hi;" + " .reg .pred p;" + " .reg .f64 r0;" + " mov.b64 %0, %1;" + " mov.b64 {lo, hi}, %1;" + " shfl.up.b32 lo|p, lo, %2, %3;" + " shfl.up.b32 hi|p, hi, %2, %3;" + " mov.b64 r0, {lo, hi};" + " @p add.f64 %0, %0, r0;" + "}" + : "=d"(output) : "d"(input), "r"(offset), "r"(shfl_c)); +#endif + + return output; + } + + +/* + /// Inclusive prefix scan (specialized for ReduceBySegmentOp across KeyValuePair types) + template + __device__ __forceinline__ KeyValuePairInclusiveScanStep( + KeyValuePair input, ///< [in] Calling thread's input item. + ReduceBySegmentOp scan_op, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset) ///< [in] Up-offset to pull from + { + KeyValuePair output; + + output.value = InclusiveScanStep(input.value, cub::Sum(), first_lane, offset, Int2Type::IS_SMALL_UNSIGNED>()); + output.key = InclusiveScanStep(input.key, cub::Sum(), first_lane, offset, Int2Type::IS_SMALL_UNSIGNED>()); + + if (input.key > 0) + output.value = input.value; + + return output; + } +*/ + + /// Inclusive prefix scan step (generic) + template + __device__ __forceinline__ _T InclusiveScanStep( + _T input, ///< [in] Calling thread's input item. + ScanOpT scan_op, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset) ///< [in] Up-offset to pull from + { + _T temp = ShuffleUp(input, offset, first_lane, member_mask); + + // Perform scan op if from a valid peer + _T output = scan_op(temp, input); + if (static_cast(lane_id) < first_lane + offset) + output = input; + + return output; + } + + + /// Inclusive prefix scan step (specialized for small integers size 32b or less) + template + __device__ __forceinline__ _T InclusiveScanStep( + _T input, ///< [in] Calling thread's input item. + ScanOpT scan_op, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset, ///< [in] Up-offset to pull from + Int2Type /*is_small_unsigned*/) ///< [in] Marker type indicating whether T is a small integer + { + return InclusiveScanStep(input, scan_op, first_lane, offset); + } + + + /// Inclusive prefix scan step (specialized for types other than small integers size 32b or less) + template + __device__ __forceinline__ _T InclusiveScanStep( + _T input, ///< [in] Calling thread's input item. 
+ ScanOpT scan_op, ///< [in] Binary scan operator + int first_lane, ///< [in] Index of first lane in segment + int offset, ///< [in] Up-offset to pull from + Int2Type /*is_small_unsigned*/) ///< [in] Marker type indicating whether T is a small integer + { + return InclusiveScanStep(input, scan_op, first_lane, offset); + } + + + /****************************************************************************** + * Interface + ******************************************************************************/ + + //--------------------------------------------------------------------- + // Broadcast + //--------------------------------------------------------------------- + + /// Broadcast + __device__ __forceinline__ T Broadcast( + T input, ///< [in] The value to broadcast + int src_lane) ///< [in] Which warp lane is to do the broadcasting + { + return ShuffleIndex(input, src_lane, member_mask); + } + + + //--------------------------------------------------------------------- + // Inclusive operations + //--------------------------------------------------------------------- + + /// Inclusive scan + template + __device__ __forceinline__ void InclusiveScan( + _T input, ///< [in] Calling thread's input item. + _T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOpT scan_op) ///< [in] Binary scan operator + { + inclusive_output = input; + + // Iterate scan steps + int segment_first_lane = 0; + + // Iterate scan steps + #pragma unroll + for (int STEP = 0; STEP < STEPS; STEP++) + { + inclusive_output = InclusiveScanStep( + inclusive_output, + scan_op, + segment_first_lane, + (1 << STEP), + Int2Type::IS_SMALL_UNSIGNED>()); + } + + } + + /// Inclusive scan, specialized for reduce-value-by-key + template + __device__ __forceinline__ void InclusiveScan( + KeyValuePair input, ///< [in] Calling thread's input item. + KeyValuePair &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + ReduceByKeyOp scan_op) ///< [in] Binary scan operator + { + inclusive_output = input; + + KeyT pred_key = ShuffleUp(inclusive_output.key, 1, 0, member_mask); + + unsigned int ballot = WARP_BALLOT((pred_key != inclusive_output.key), member_mask); + + // Mask away all lanes greater than ours + ballot = ballot & LaneMaskLe(); + + // Find index of first set bit + int segment_first_lane = CUB_MAX(0, 31 - __clz(ballot)); + + // Iterate scan steps + #pragma unroll + for (int STEP = 0; STEP < STEPS; STEP++) + { + inclusive_output.value = InclusiveScanStep( + inclusive_output.value, + scan_op.op, + segment_first_lane, + (1 << STEP), + Int2Type::IS_SMALL_UNSIGNED>()); + } + } + + + /// Inclusive scan with aggregate + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item. + T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOpT scan_op, ///< [in] Binary scan operator + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. 
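    // [Editorial note — illustrative, not part of the vendored CUB header.]
    // On the reduce-value-by-key scan above: WARP_BALLOT marks the lanes whose key
    // differs from the previous lane's key, LaneMaskLe() keeps only the marks at or
    // below the calling lane, and 31 - __clz(...) picks the highest surviving mark,
    // i.e. the first lane of the calling thread's run of equal keys.  Worked example
    // with per-lane keys A A B B B C ... and calling lane 4 (key B):
    //     discontinuity ballot   : bits 2 and 5 set
    //     & LaneMaskLe() (lane 4): bit 2 remains
    //     segment_first_lane     : 31 - __clz(0b100) = 2
    // An empty ballot gives 31 - 32 = -1, which CUB_MAX(0, ...) clamps to lane 0.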
+ { + InclusiveScan(input, inclusive_output, scan_op); + + // Grab aggregate from last warp lane + warp_aggregate = ShuffleIndex(inclusive_output, LOGICAL_WARP_THREADS - 1, member_mask); + } + + + //--------------------------------------------------------------------- + // Get exclusive from inclusive + //--------------------------------------------------------------------- + + /// Update inclusive and exclusive using input and inclusive + template + __device__ __forceinline__ void Update( + T /*input*/, ///< [in] + T &inclusive, ///< [in, out] + T &exclusive, ///< [out] + ScanOpT /*scan_op*/, ///< [in] + IsIntegerT /*is_integer*/) ///< [in] + { + // initial value unknown + exclusive = ShuffleUp(inclusive, 1, 0, member_mask); + } + + /// Update inclusive and exclusive using input and inclusive (specialized for summation of integer types) + __device__ __forceinline__ void Update( + T input, + T &inclusive, + T &exclusive, + cub::Sum /*scan_op*/, + Int2Type /*is_integer*/) + { + // initial value presumed 0 + exclusive = inclusive - input; + } + + /// Update inclusive and exclusive using initial value using input, inclusive, and initial value + template + __device__ __forceinline__ void Update ( + T /*input*/, + T &inclusive, + T &exclusive, + ScanOpT scan_op, + T initial_value, + IsIntegerT /*is_integer*/) + { + inclusive = scan_op(initial_value, inclusive); + exclusive = ShuffleUp(inclusive, 1, 0, member_mask); + + if (lane_id == 0) + exclusive = initial_value; + } + + /// Update inclusive and exclusive using initial value using input and inclusive (specialized for summation of integer types) + __device__ __forceinline__ void Update ( + T input, + T &inclusive, + T &exclusive, + cub::Sum scan_op, + T initial_value, + Int2Type /*is_integer*/) + { + inclusive = scan_op(initial_value, inclusive); + exclusive = inclusive - input; + } + + + /// Update inclusive, exclusive, and warp aggregate using input and inclusive + template + __device__ __forceinline__ void Update ( + T input, + T &inclusive, + T &exclusive, + T &warp_aggregate, + ScanOpT scan_op, + IsIntegerT is_integer) + { + warp_aggregate = ShuffleIndex(inclusive, LOGICAL_WARP_THREADS - 1, member_mask); + Update(input, inclusive, exclusive, scan_op, is_integer); + } + + /// Update inclusive, exclusive, and warp aggregate using input, inclusive, and initial value + template + __device__ __forceinline__ void Update ( + T input, + T &inclusive, + T &exclusive, + T &warp_aggregate, + ScanOpT scan_op, + T initial_value, + IsIntegerT is_integer) + { + warp_aggregate = ShuffleIndex(inclusive, LOGICAL_WARP_THREADS - 1, member_mask); + Update(input, inclusive, exclusive, scan_op, initial_value, is_integer); + } + + + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/warp/specializations/warp_scan_smem.cuh b/GraphBLAS/CUDA/local_cub/warp/specializations/warp_scan_smem.cuh new file mode 100644 index 0000000000..3237fcbfe9 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/warp/specializations/warp_scan_smem.cuh @@ -0,0 +1,397 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * cub::WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned across a CUDA thread warp. + */ + +#pragma once + +#include "../../thread/thread_operators.cuh" +#include "../../thread/thread_load.cuh" +#include "../../thread/thread_store.cuh" +#include "../../util_type.cuh" +#include "../../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \brief WarpScanSmem provides smem-based variants of parallel prefix scan of items partitioned across a CUDA thread warp. 
+ */ +template < + typename T, ///< Data type being scanned + int LOGICAL_WARP_THREADS, ///< Number of threads per logical warp + int PTX_ARCH> ///< The PTX compute capability for which to to specialize this collective +struct WarpScanSmem +{ + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + enum + { + /// Whether the logical warp size and the PTX warp size coincide + IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), + + /// Whether the logical warp size is a power-of-two + IS_POW_OF_TWO = PowerOfTwo::VALUE, + + /// The number of warp scan steps + STEPS = Log2::VALUE, + + /// The number of threads in half a warp + HALF_WARP_THREADS = 1 << (STEPS - 1), + + /// The number of shared memory elements per warp + WARP_SMEM_ELEMENTS = LOGICAL_WARP_THREADS + HALF_WARP_THREADS, + }; + + /// Storage cell type (workaround for SM1x compiler bugs with custom-ops like Max() on signed chars) + typedef typename If<((Equals::VALUE || Equals::VALUE) && (PTX_ARCH < 200)), int, T>::Type CellT; + + /// Shared memory storage layout type (1.5 warps-worth of elements for each warp) + typedef CellT _TempStorage[WARP_SMEM_ELEMENTS]; + + // Alias wrapper allowing storage to be unioned + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + _TempStorage &temp_storage; + unsigned int lane_id; + unsigned int member_mask; + + + /****************************************************************************** + * Construction + ******************************************************************************/ + + /// Constructor + __device__ __forceinline__ WarpScanSmem( + TempStorage &temp_storage) + : + temp_storage(temp_storage.Alias()), + + lane_id(IS_ARCH_WARP ? + LaneId() : + LaneId() % LOGICAL_WARP_THREADS), + + member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP || !IS_POW_OF_TWO ) ? 
+ 0 : // arch-width and non-power-of-two subwarps cannot be tiled with the arch-warp + ((LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS))) + {} + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + + /// Basic inclusive scan iteration (template unrolled, inductive-case specialization) + template < + bool HAS_IDENTITY, + int STEP, + typename ScanOp> + __device__ __forceinline__ void ScanStep( + T &partial, + ScanOp scan_op, + Int2Type /*step*/) + { + const int OFFSET = 1 << STEP; + + // Share partial into buffer + ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) partial); + + WARP_SYNC(member_mask); + + // Update partial if addend is in range + if (HAS_IDENTITY || (lane_id >= OFFSET)) + { + T addend = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - OFFSET]); + partial = scan_op(addend, partial); + } + WARP_SYNC(member_mask); + + ScanStep(partial, scan_op, Int2Type()); + } + + + /// Basic inclusive scan iteration(template unrolled, base-case specialization) + template < + bool HAS_IDENTITY, + typename ScanOp> + __device__ __forceinline__ void ScanStep( + T &/*partial*/, + ScanOp /*scan_op*/, + Int2Type /*step*/) + {} + + + /// Inclusive prefix scan (specialized for summation across primitive types) + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item. + T &output, ///< [out] Calling thread's output item. May be aliased with \p input. + Sum scan_op, ///< [in] Binary scan operator + Int2Type /*is_primitive*/) ///< [in] Marker type indicating whether T is primitive type + { + T identity = 0; + ThreadStore(&temp_storage[lane_id], (CellT) identity); + + WARP_SYNC(member_mask); + + // Iterate scan steps + output = input; + ScanStep(output, scan_op, Int2Type<0>()); + } + + + /// Inclusive prefix scan + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item. + T &output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOp scan_op, ///< [in] Binary scan operator + Int2Type /*is_primitive*/) ///< [in] Marker type indicating whether T is primitive type + { + // Iterate scan steps + output = input; + ScanStep(output, scan_op, Int2Type<0>()); + } + + + /****************************************************************************** + * Interface + ******************************************************************************/ + + //--------------------------------------------------------------------- + // Broadcast + //--------------------------------------------------------------------- + + /// Broadcast + __device__ __forceinline__ T Broadcast( + T input, ///< [in] The value to broadcast + unsigned int src_lane) ///< [in] Which warp lane is to do the broadcasting + { + if (lane_id == src_lane) + { + ThreadStore(temp_storage, (CellT) input); + } + + WARP_SYNC(member_mask); + + return (T)ThreadLoad(temp_storage); + } + + + //--------------------------------------------------------------------- + // Inclusive operations + //--------------------------------------------------------------------- + + /// Inclusive scan + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item. + T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. 
+ ScanOp scan_op) ///< [in] Binary scan operator + { + InclusiveScan(input, inclusive_output, scan_op, Int2Type::PRIMITIVE>()); + } + + + /// Inclusive scan with aggregate + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item. + T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOp scan_op, ///< [in] Binary scan operator + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. + { + InclusiveScan(input, inclusive_output, scan_op); + + // Retrieve aggregate + ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive_output); + + WARP_SYNC(member_mask); + + warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); + + WARP_SYNC(member_mask); + } + + + //--------------------------------------------------------------------- + // Get exclusive from inclusive + //--------------------------------------------------------------------- + + /// Update inclusive and exclusive using input and inclusive + template + __device__ __forceinline__ void Update( + T /*input*/, ///< [in] + T &inclusive, ///< [in, out] + T &exclusive, ///< [out] + ScanOpT /*scan_op*/, ///< [in] + IsIntegerT /*is_integer*/) ///< [in] + { + // initial value unknown + ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); + + WARP_SYNC(member_mask); + + exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 1]); + } + + /// Update inclusive and exclusive using input and inclusive (specialized for summation of integer types) + __device__ __forceinline__ void Update( + T input, + T &inclusive, + T &exclusive, + cub::Sum /*scan_op*/, + Int2Type /*is_integer*/) + { + // initial value presumed 0 + exclusive = inclusive - input; + } + + /// Update inclusive and exclusive using initial value using input, inclusive, and initial value + template + __device__ __forceinline__ void Update ( + T /*input*/, + T &inclusive, + T &exclusive, + ScanOpT scan_op, + T initial_value, + IsIntegerT /*is_integer*/) + { + inclusive = scan_op(initial_value, inclusive); + ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); + + WARP_SYNC(member_mask); + + exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 1]); + if (lane_id == 0) + exclusive = initial_value; + } + + /// Update inclusive and exclusive using initial value using input and inclusive (specialized for summation of integer types) + __device__ __forceinline__ void Update ( + T input, + T &inclusive, + T &exclusive, + cub::Sum scan_op, + T initial_value, + Int2Type /*is_integer*/) + { + inclusive = scan_op(initial_value, inclusive); + exclusive = inclusive - input; + } + + + /// Update inclusive, exclusive, and warp aggregate using input and inclusive + template + __device__ __forceinline__ void Update ( + T /*input*/, + T &inclusive, + T &exclusive, + T &warp_aggregate, + ScanOpT /*scan_op*/, + IsIntegerT /*is_integer*/) + { + // Initial value presumed to be unknown or identity (either way our padding is correct) + ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); + + WARP_SYNC(member_mask); + + exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 1]); + warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); + } + + /// Update inclusive, exclusive, and warp aggregate using input and inclusive (specialized for summation of integer types) + __device__ __forceinline__ void Update ( + T input, + T &inclusive, + T 
&exclusive, + T &warp_aggregate, + cub::Sum /*scan_o*/, + Int2Type /*is_integer*/) + { + // Initial value presumed to be unknown or identity (either way our padding is correct) + ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); + + WARP_SYNC(member_mask); + + warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); + exclusive = inclusive - input; + } + + /// Update inclusive, exclusive, and warp aggregate using input, inclusive, and initial value + template + __device__ __forceinline__ void Update ( + T /*input*/, + T &inclusive, + T &exclusive, + T &warp_aggregate, + ScanOpT scan_op, + T initial_value, + IsIntegerT /*is_integer*/) + { + // Broadcast warp aggregate + ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id], (CellT) inclusive); + + WARP_SYNC(member_mask); + + warp_aggregate = (T) ThreadLoad(&temp_storage[WARP_SMEM_ELEMENTS - 1]); + + WARP_SYNC(member_mask); + + // Update inclusive with initial value + inclusive = scan_op(initial_value, inclusive); + + // Get exclusive from exclusive + ThreadStore(&temp_storage[HALF_WARP_THREADS + lane_id - 1], (CellT) inclusive); + + WARP_SYNC(member_mask); + + exclusive = (T) ThreadLoad(&temp_storage[HALF_WARP_THREADS + lane_id - 2]); + + if (lane_id == 0) + exclusive = initial_value; + } + + +}; + + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/warp/warp_reduce.cuh b/GraphBLAS/CUDA/local_cub/warp/warp_reduce.cuh new file mode 100644 index 0000000000..189896b071 --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/warp/warp_reduce.cuh @@ -0,0 +1,612 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + ******************************************************************************/ + +/** + * \file + * The cub::WarpReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread warp. + */ + +#pragma once + +#include "specializations/warp_reduce_shfl.cuh" +#include "specializations/warp_reduce_smem.cuh" +#include "../thread/thread_operators.cuh" +#include "../util_arch.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + + +/** + * \addtogroup WarpModule + * @{ + */ + +/** + * \brief The WarpReduce class provides [collective](index.html#sec0) methods for computing a parallel reduction of items partitioned across a CUDA thread warp. ![](warp_reduce_logo.png) + * + * \tparam T The reduction input/output element type + * \tparam LOGICAL_WARP_THREADS [optional] The number of threads per "logical" warp (may be less than the number of hardware warp threads). Default is the warp size of the targeted CUDA compute-capability (e.g., 32 threads for SM20). + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - A reduction (or fold) + * uses a binary combining operator to compute a single aggregate from a list of input elements. + * - Supports "logical" warps smaller than the physical warp size (e.g., logical warps of 8 threads) + * - The number of entrant threads must be an multiple of \p LOGICAL_WARP_THREADS + * + * \par Performance Considerations + * - Uses special instructions when applicable (e.g., warp \p SHFL instructions) + * - Uses synchronization-free communication between warp lanes when applicable + * - Incurs zero bank conflicts for most types + * - Computation is slightly more efficient (i.e., having lower instruction overhead) for: + * - Summation (vs. generic reduction) + * - The architecture's warp size is a whole multiple of \p LOGICAL_WARP_THREADS + * + * \par Simple Examples + * \warpcollective{WarpReduce} + * \par + * The code snippet below illustrates four concurrent warp sum reductions within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for 4 warps + * __shared__ typename WarpReduce::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Return the warp-wide sums to each lane0 (threads 0, 32, 64, and 96) + * int warp_id = threadIdx.x / 32; + * int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, 1, 2, 3, ..., 127}. + * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will \p 496, \p 1520, + * \p 2544, and \p 3568, respectively (and is undefined in other threads). + * + * \par + * The code snippet below illustrates a single warp sum reduction within a block of + * 128 threads. + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for one warp + * __shared__ typename WarpReduce::TempStorage temp_storage; + * ... 
+ * + * // Only the first warp performs a reduction + * if (threadIdx.x < 32) + * { + * // Obtain one input item per thread + * int thread_data = ... + * + * // Return the warp-wide sum to lane0 + * int aggregate = WarpReduce(temp_storage).Sum(thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the warp of threads is {0, 1, 2, 3, ..., 31}. + * The corresponding output \p aggregate in thread0 will be \p 496 (and is undefined in other threads). + * + */ +template < + typename T, + int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, + int PTX_ARCH = CUB_PTX_ARCH> +class WarpReduce +{ +private: + + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + enum + { + /// Whether the logical warp size and the PTX warp size coincide + IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), + + /// Whether the logical warp size is a power-of-two + IS_POW_OF_TWO = PowerOfTwo::VALUE, + }; + +public: + + #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document + + /// Internal specialization. Use SHFL-based reduction if (architecture is >= SM30) and (LOGICAL_WARP_THREADS is a power-of-two) + typedef typename If<(PTX_ARCH >= 300) && (IS_POW_OF_TWO), + WarpReduceShfl, + WarpReduceSmem >::Type InternalWarpReduce; + + #endif // DOXYGEN_SHOULD_SKIP_THIS + + +private: + + /// Shared memory storage layout type for WarpReduce + typedef typename InternalWarpReduce::TempStorage _TempStorage; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + + + /****************************************************************************** + * Utility methods + ******************************************************************************/ + +public: + + /// \smemstorage{WarpReduce} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. Logical warp and lane identifiers are constructed from threadIdx.x. + */ + __device__ __forceinline__ WarpReduce( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()) + {} + + + //@} end member group + /******************************************************************//** + * \name Summation reductions + *********************************************************************/ + //@{ + + + /** + * \brief Computes a warp-wide sum in the calling warp. The output is valid in warp lane0. + * + * \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp sum reductions within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for 4 warps + * __shared__ typename WarpReduce::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... 
+ * + * // Return the warp-wide sums to each lane0 + * int warp_id = threadIdx.x / 32; + * int aggregate = WarpReduce(temp_storage[warp_id]).Sum(thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, 1, 2, 3, ..., 127}. + * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will \p 496, \p 1520, + * \p 2544, and \p 3568, respectively (and is undefined in other threads). + * + */ + __device__ __forceinline__ T Sum( + T input) ///< [in] Calling thread's input + { + return InternalWarpReduce(temp_storage).template Reduce(input, LOGICAL_WARP_THREADS, cub::Sum()); + } + + /** + * \brief Computes a partially-full warp-wide sum in the calling warp. The output is valid in warp lane0. + * + * All threads across the calling warp must agree on the same value for \p valid_items. Otherwise the result is undefined. + * + * \smemreuse + * + * \par Snippet + * The code snippet below illustrates a sum reduction within a single, partially-full + * block of 32 threads (one warp). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(int *d_data, int valid_items) + * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for one warp + * __shared__ typename WarpReduce::TempStorage temp_storage; + * + * // Obtain one input item per thread if in range + * int thread_data; + * if (threadIdx.x < valid_items) + * thread_data = d_data[threadIdx.x]; + * + * // Return the warp-wide sums to each lane0 + * int aggregate = WarpReduce(temp_storage).Sum( + * thread_data, valid_items); + * + * \endcode + * \par + * Suppose the input \p d_data is {0, 1, 2, 3, 4, ... and \p valid_items + * is \p 4. The corresponding output \p aggregate in thread0 is \p 6 (and is + * undefined in other threads). + * + */ + __device__ __forceinline__ T Sum( + T input, ///< [in] Calling thread's input + int valid_items) ///< [in] Total number of valid items in the calling thread's logical warp (may be less than \p LOGICAL_WARP_THREADS) + { + // Determine if we don't need bounds checking + return InternalWarpReduce(temp_storage).template Reduce(input, valid_items, cub::Sum()); + } + + + /** + * \brief Computes a segmented sum in the calling warp where segments are defined by head-flags. The sum of each segment is returned to the first lane in that segment (which always includes lane0). + * + * \smemreuse + * + * \par Snippet + * The code snippet below illustrates a head-segmented warp sum + * reduction within a block of 32 threads (one warp). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for one warp + * __shared__ typename WarpReduce::TempStorage temp_storage; + * + * // Obtain one input item and flag per thread + * int thread_data = ... + * int head_flag = ... + * + * // Return the warp-wide sums to each lane0 + * int aggregate = WarpReduce(temp_storage).HeadSegmentedSum( + * thread_data, head_flag); + * + * \endcode + * \par + * Suppose the set of input \p thread_data and \p head_flag across the block of threads + * is {0, 1, 2, 3, ..., 31 and is {1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0, + * respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be + * \p 6, \p 22, \p 38, etc. (and is undefined in other threads). 
+ * + * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + * + */ + template < + typename FlagT> + __device__ __forceinline__ T HeadSegmentedSum( + T input, ///< [in] Calling thread's input + FlagT head_flag) ///< [in] Head flag denoting whether or not \p input is the start of a new segment + { + return HeadSegmentedReduce(input, head_flag, cub::Sum()); + } + + + /** + * \brief Computes a segmented sum in the calling warp where segments are defined by tail-flags. The sum of each segment is returned to the first lane in that segment (which always includes lane0). + * + * \smemreuse + * + * \par Snippet + * The code snippet below illustrates a tail-segmented warp sum + * reduction within a block of 32 threads (one warp). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for one warp + * __shared__ typename WarpReduce::TempStorage temp_storage; + * + * // Obtain one input item and flag per thread + * int thread_data = ... + * int tail_flag = ... + * + * // Return the warp-wide sums to each lane0 + * int aggregate = WarpReduce(temp_storage).TailSegmentedSum( + * thread_data, tail_flag); + * + * \endcode + * \par + * Suppose the set of input \p thread_data and \p tail_flag across the block of threads + * is {0, 1, 2, 3, ..., 31 and is {0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1, + * respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be + * \p 6, \p 22, \p 38, etc. (and is undefined in other threads). + * + * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + */ + template < + typename FlagT> + __device__ __forceinline__ T TailSegmentedSum( + T input, ///< [in] Calling thread's input + FlagT tail_flag) ///< [in] Head flag denoting whether or not \p input is the start of a new segment + { + return TailSegmentedReduce(input, tail_flag, cub::Sum()); + } + + + + //@} end member group + /******************************************************************//** + * \name Generic reductions + *********************************************************************/ + //@{ + + /** + * \brief Computes a warp-wide reduction in the calling warp using the specified binary reduction functor. The output is valid in warp lane0. + * + * Supports non-commutative reduction operators + * + * \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp max reductions within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for 4 warps + * __shared__ typename WarpReduce::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Return the warp-wide reductions to each lane0 + * int warp_id = threadIdx.x / 32; + * int aggregate = WarpReduce(temp_storage[warp_id]).Reduce( + * thread_data, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, 1, 2, 3, ..., 127}. + * The corresponding output \p aggregate in threads 0, 32, 64, and 96 will \p 31, \p 63, + * \p 95, and \p 127, respectively (and is undefined in other threads). 
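The inline usage snippets in this header appear to have lost their template arguments and include targets when the diff was captured (e.g. "typedef cub::WarpReduce WarpReduce;" with no element type). The following is a hedged, self-contained sketch of what those snippets intend; the include path, kernel name, and parameter names are illustrative assumptions, not part of the vendored header.

// Sketch only: reconstructs the WarpReduce usage documented above.
#include "local_cub/warp/warp_reduce.cuh"    // assumed vendored path

__global__ void WarpReduceExample (const int *d_in, int *d_warp_sums)
{
    // Specialize WarpReduce for one 32-thread warp of int items
    typedef cub::WarpReduce<int> WarpReduce ;

    // One TempStorage per warp, assuming a 128-thread block (4 warps)
    __shared__ typename WarpReduce::TempStorage temp_storage [4] ;

    int thread_data = d_in [blockIdx.x * blockDim.x + threadIdx.x] ;
    int warp_id = threadIdx.x / 32 ;

    // warp-wide sum: only lane 0 of each warp gets a defined aggregate
    int warp_sum = WarpReduce (temp_storage [warp_id]).Sum (thread_data) ;

    // a generic reduction with a user-supplied operator has the same shape:
    //   int warp_max = WarpReduce (temp_storage [warp_id]).Reduce (thread_data, cub::Max ()) ;

    if (threadIdx.x % 32 == 0)
    {
        d_warp_sums [blockIdx.x * 4 + warp_id] = warp_sum ;
    }
}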
+ * + * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ T Reduce( + T input, ///< [in] Calling thread's input + ReductionOp reduction_op) ///< [in] Binary reduction operator + { + return InternalWarpReduce(temp_storage).template Reduce(input, LOGICAL_WARP_THREADS, reduction_op); + } + + /** + * \brief Computes a partially-full warp-wide reduction in the calling warp using the specified binary reduction functor. The output is valid in warp lane0. + * + * All threads across the calling warp must agree on the same value for \p valid_items. Otherwise the result is undefined. + * + * Supports non-commutative reduction operators + * + * \smemreuse + * + * \par Snippet + * The code snippet below illustrates a max reduction within a single, partially-full + * block of 32 threads (one warp). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(int *d_data, int valid_items) + * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for one warp + * __shared__ typename WarpReduce::TempStorage temp_storage; + * + * // Obtain one input item per thread if in range + * int thread_data; + * if (threadIdx.x < valid_items) + * thread_data = d_data[threadIdx.x]; + * + * // Return the warp-wide reductions to each lane0 + * int aggregate = WarpReduce(temp_storage).Reduce( + * thread_data, cub::Max(), valid_items); + * + * \endcode + * \par + * Suppose the input \p d_data is {0, 1, 2, 3, 4, ... and \p valid_items + * is \p 4. The corresponding output \p aggregate in thread0 is \p 3 (and is + * undefined in other threads). + * + * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ T Reduce( + T input, ///< [in] Calling thread's input + ReductionOp reduction_op, ///< [in] Binary reduction operator + int valid_items) ///< [in] Total number of valid items in the calling thread's logical warp (may be less than \p LOGICAL_WARP_THREADS) + { + return InternalWarpReduce(temp_storage).template Reduce(input, valid_items, reduction_op); + } + + + /** + * \brief Computes a segmented reduction in the calling warp where segments are defined by head-flags. The reduction of each segment is returned to the first lane in that segment (which always includes lane0). + * + * Supports non-commutative reduction operators + * + * \smemreuse + * + * \par Snippet + * The code snippet below illustrates a head-segmented warp max + * reduction within a block of 32 threads (one warp). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for one warp + * __shared__ typename WarpReduce::TempStorage temp_storage; + * + * // Obtain one input item and flag per thread + * int thread_data = ... + * int head_flag = ... + * + * // Return the warp-wide reductions to each lane0 + * int aggregate = WarpReduce(temp_storage).HeadSegmentedReduce( + * thread_data, head_flag, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data and \p head_flag across the block of threads + * is {0, 1, 2, 3, ..., 31 and is {1, 0, 0, 0, 1, 0, 0, 0, ..., 1, 0, 0, 0, + * respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be + * \p 3, \p 7, \p 11, etc. 
(and is undefined in other threads). + * + * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + */ + template < + typename ReductionOp, + typename FlagT> + __device__ __forceinline__ T HeadSegmentedReduce( + T input, ///< [in] Calling thread's input + FlagT head_flag, ///< [in] Head flag denoting whether or not \p input is the start of a new segment + ReductionOp reduction_op) ///< [in] Reduction operator + { + return InternalWarpReduce(temp_storage).template SegmentedReduce(input, head_flag, reduction_op); + } + + + /** + * \brief Computes a segmented reduction in the calling warp where segments are defined by tail-flags. The reduction of each segment is returned to the first lane in that segment (which always includes lane0). + * + * Supports non-commutative reduction operators + * + * \smemreuse + * + * \par Snippet + * The code snippet below illustrates a tail-segmented warp max + * reduction within a block of 32 threads (one warp). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpReduce for type int + * typedef cub::WarpReduce WarpReduce; + * + * // Allocate WarpReduce shared memory for one warp + * __shared__ typename WarpReduce::TempStorage temp_storage; + * + * // Obtain one input item and flag per thread + * int thread_data = ... + * int tail_flag = ... + * + * // Return the warp-wide reductions to each lane0 + * int aggregate = WarpReduce(temp_storage).TailSegmentedReduce( + * thread_data, tail_flag, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data and \p tail_flag across the block of threads + * is {0, 1, 2, 3, ..., 31 and is {0, 0, 0, 1, 0, 0, 0, 1, ..., 0, 0, 0, 1, + * respectively. The corresponding output \p aggregate in threads 0, 4, 8, etc. will be + * \p 3, \p 7, \p 11, etc. (and is undefined in other threads). + * + * \tparam ReductionOp [inferred] Binary reduction operator type having member T operator()(const T &a, const T &b) + */ + template < + typename ReductionOp, + typename FlagT> + __device__ __forceinline__ T TailSegmentedReduce( + T input, ///< [in] Calling thread's input + FlagT tail_flag, ///< [in] Tail flag denoting whether or not \p input is the end of the current segment + ReductionOp reduction_op) ///< [in] Reduction operator + { + return InternalWarpReduce(temp_storage).template SegmentedReduce(input, tail_flag, reduction_op); + } + + + + //@} end member group +}; + +/** @} */ // end group WarpModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/local_cub/warp/warp_scan.cuh b/GraphBLAS/CUDA/local_cub/warp/warp_scan.cuh new file mode 100644 index 0000000000..c7af0d343d --- /dev/null +++ b/GraphBLAS/CUDA/local_cub/warp/warp_scan.cuh @@ -0,0 +1,936 @@ +/****************************************************************************** + * Copyright (c) 2011, Duane Merrill. All rights reserved. + * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ + +/** + * \file + * The cub::WarpScan class provides [collective](index.html#sec0) methods for computing a parallel prefix scan of items partitioned across a CUDA thread warp. + */ + +#pragma once + +#include "specializations/warp_scan_shfl.cuh" +#include "specializations/warp_scan_smem.cuh" +#include "../thread/thread_operators.cuh" +#include "../util_arch.cuh" +#include "../util_type.cuh" +#include "../util_namespace.cuh" + +/// Optional outer namespace(s) +CUB_NS_PREFIX + +/// CUB namespace +namespace cub { + +/** + * \addtogroup WarpModule + * @{ + */ + +/** + * \brief The WarpScan class provides [collective](index.html#sec0) methods for computing a parallel prefix scan of items partitioned across a CUDA thread warp. ![](warp_scan_logo.png) + * + * \tparam T The scan input/output element type + * \tparam LOGICAL_WARP_THREADS [optional] The number of threads per "logical" warp (may be less than the number of hardware warp threads). Default is the warp size associated with the CUDA Compute Capability targeted by the compiler (e.g., 32 threads for SM20). + * \tparam PTX_ARCH [optional] \ptxversion + * + * \par Overview + * - Given a list of input elements and a binary reduction operator, a [prefix scan](http://en.wikipedia.org/wiki/Prefix_sum) + * produces an output list where each element is computed to be the reduction + * of the elements occurring earlier in the input list. Prefix sum + * connotes a prefix scan with the addition operator. The term \em inclusive indicates + * that the ith output reduction incorporates the ith input. + * The term \em exclusive indicates the ith input is not incorporated into + * the ith output reduction. + * - Supports non-commutative scan operators + * - Supports "logical" warps smaller than the physical warp size (e.g., a logical warp of 8 threads) + * - The number of entrant threads must be an multiple of \p LOGICAL_WARP_THREADS + * + * \par Performance Considerations + * - Uses special instructions when applicable (e.g., warp \p SHFL) + * - Uses synchronization-free communication between warp lanes when applicable + * - Incurs zero bank conflicts for most types + * - Computation is slightly more efficient (i.e., having lower instruction overhead) for: + * - Summation (vs. 
generic scan) + * - The architecture's warp size is a whole multiple of \p LOGICAL_WARP_THREADS + * + * \par Simple Examples + * \warpcollective{WarpScan} + * \par + * The code snippet below illustrates four concurrent warp prefix sums within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute warp-wide prefix sums + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}. + * The corresponding output \p thread_data in each of the four warps of threads will be + * 0, 1, 2, 3, ..., 31}. + * + * \par + * The code snippet below illustrates a single warp prefix sum within a block of + * 128 threads. + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for one warp + * __shared__ typename WarpScan::TempStorage temp_storage; + * ... + * + * // Only the first warp performs a prefix sum + * if (threadIdx.x < 32) + * { + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute warp-wide prefix sums + * WarpScan(temp_storage).ExclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the warp of threads is {1, 1, 1, 1, ...}. + * The corresponding output \p thread_data will be {0, 1, 2, 3, ..., 31}. + * + */ +template < + typename T, + int LOGICAL_WARP_THREADS = CUB_PTX_WARP_THREADS, + int PTX_ARCH = CUB_PTX_ARCH> +class WarpScan +{ +private: + + /****************************************************************************** + * Constants and type definitions + ******************************************************************************/ + + enum + { + /// Whether the logical warp size and the PTX warp size coincide + IS_ARCH_WARP = (LOGICAL_WARP_THREADS == CUB_WARP_THREADS(PTX_ARCH)), + + /// Whether the logical warp size is a power-of-two + IS_POW_OF_TWO = ((LOGICAL_WARP_THREADS & (LOGICAL_WARP_THREADS - 1)) == 0), + + /// Whether the data type is an integer (which has fully-associative addition) + IS_INTEGER = ((Traits::CATEGORY == SIGNED_INTEGER) || (Traits::CATEGORY == UNSIGNED_INTEGER)) + }; + + /// Internal specialization. 
Use SHFL-based scan if (architecture is >= SM30) and (LOGICAL_WARP_THREADS is a power-of-two) + typedef typename If<(PTX_ARCH >= 300) && (IS_POW_OF_TWO), + WarpScanShfl, + WarpScanSmem >::Type InternalWarpScan; + + /// Shared memory storage layout type for WarpScan + typedef typename InternalWarpScan::TempStorage _TempStorage; + + + /****************************************************************************** + * Thread fields + ******************************************************************************/ + + /// Shared storage reference + _TempStorage &temp_storage; + unsigned int lane_id; + + + + /****************************************************************************** + * Public types + ******************************************************************************/ + +public: + + /// \smemstorage{WarpScan} + struct TempStorage : Uninitialized<_TempStorage> {}; + + + /******************************************************************//** + * \name Collective constructors + *********************************************************************/ + //@{ + + /** + * \brief Collective constructor using the specified memory allocation as temporary storage. Logical warp and lane identifiers are constructed from threadIdx.x. + */ + __device__ __forceinline__ WarpScan( + TempStorage &temp_storage) ///< [in] Reference to memory allocation having layout type TempStorage + : + temp_storage(temp_storage.Alias()), + lane_id(IS_ARCH_WARP ? + LaneId() : + LaneId() % LOGICAL_WARP_THREADS) + {} + + + //@} end member group + /******************************************************************//** + * \name Inclusive prefix sums + *********************************************************************/ + //@{ + + + /** + * \brief Computes an inclusive prefix sum across the calling warp. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide inclusive prefix sums within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute inclusive warp-wide prefix sums + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}. + * The corresponding output \p thread_data in each of the four warps of threads will be + * 1, 2, 3, ..., 32}. + */ + __device__ __forceinline__ void InclusiveSum( + T input, ///< [in] Calling thread's input item. + T &inclusive_output) ///< [out] Calling thread's output item. May be aliased with \p input. + { + InclusiveScan(input, inclusive_output, cub::Sum()); + } + + + /** + * \brief Computes an inclusive prefix sum across the calling warp. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide inclusive prefix sums within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute inclusive warp-wide prefix sums + * int warp_aggregate; + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).InclusiveSum(thread_data, thread_data, warp_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}. + * The corresponding output \p thread_data in each of the four warps of threads will be + * 1, 2, 3, ..., 32}. Furthermore, \p warp_aggregate for all threads in all warps will be \p 32. + */ + __device__ __forceinline__ void InclusiveSum( + T input, ///< [in] Calling thread's input item. + T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. + { + InclusiveScan(input, inclusive_output, cub::Sum(), warp_aggregate); + } + + + //@} end member group + /******************************************************************//** + * \name Exclusive prefix sums + *********************************************************************/ + //@{ + + + /** + * \brief Computes an exclusive prefix sum across the calling warp. The value of 0 is applied as the initial value, and is assigned to \p exclusive_output in thread0. + * + * \par + * - \identityzero + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide exclusive prefix sums within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute exclusive warp-wide prefix sums + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}. + * The corresponding output \p thread_data in each of the four warps of threads will be + * 0, 1, 2, ..., 31}. + * + */ + __device__ __forceinline__ void ExclusiveSum( + T input, ///< [in] Calling thread's input item. + T &exclusive_output) ///< [out] Calling thread's output item. May be aliased with \p input. + { + T initial_value = 0; + ExclusiveScan(input, exclusive_output, initial_value, cub::Sum()); + } + + + /** + * \brief Computes an exclusive prefix sum across the calling warp. The value of 0 is applied as the initial value, and is assigned to \p exclusive_output in thread0. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. + * + * \par + * - \identityzero + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide exclusive prefix sums within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute exclusive warp-wide prefix sums + * int warp_aggregate; + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).ExclusiveSum(thread_data, thread_data, warp_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {1, 1, 1, 1, ...}. + * The corresponding output \p thread_data in each of the four warps of threads will be + * 0, 1, 2, ..., 31}. Furthermore, \p warp_aggregate for all threads in all warps will be \p 32. + */ + __device__ __forceinline__ void ExclusiveSum( + T input, ///< [in] Calling thread's input item. + T &exclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. + { + T initial_value = 0; + ExclusiveScan(input, exclusive_output, initial_value, cub::Sum(), warp_aggregate); + } + + + //@} end member group + /******************************************************************//** + * \name Inclusive prefix scans + *********************************************************************/ + //@{ + + /** + * \brief Computes an inclusive prefix scan using the specified binary scan functor across the calling warp. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute inclusive warp-wide prefix max scans + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).InclusiveScan(thread_data, thread_data, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. + * The corresponding output \p thread_data in the first warp would be + * 0, 0, 2, 2, ..., 30, 30, the output for the second warp would be 32, 32, 34, 34, ..., 62, 62, etc. + * + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item. + T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOp scan_op) ///< [in] Binary scan operator + { + InternalWarpScan(temp_storage).InclusiveScan(input, inclusive_output, scan_op); + } + + + /** + * \brief Computes an inclusive prefix scan using the specified binary scan functor across the calling warp. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide inclusive prefix max scans within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) 
+ * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute inclusive warp-wide prefix max scans + * int warp_aggregate; + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).InclusiveScan( + * thread_data, thread_data, cub::Max(), warp_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. + * The corresponding output \p thread_data in the first warp would be + * 0, 0, 2, 2, ..., 30, 30, the output for the second warp would be 32, 32, 34, 34, ..., 62, 62, etc. + * Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads + * in the second warp, etc. + * + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void InclusiveScan( + T input, ///< [in] Calling thread's input item. + T &inclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOp scan_op, ///< [in] Binary scan operator + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. + { + InternalWarpScan(temp_storage).InclusiveScan(input, inclusive_output, scan_op, warp_aggregate); + } + + + //@} end member group + /******************************************************************//** + * \name Exclusive prefix scans + *********************************************************************/ + //@{ + + /** + * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp. Because no initial value is supplied, the \p output computed for warp-lane0 is undefined. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute exclusive warp-wide prefix max scans + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. + * The corresponding output \p thread_data in the first warp would be + * ?, 0, 0, 2, ..., 28, 30, the output for the second warp would be ?, 32, 32, 34, ..., 60, 62, etc. + * (The output \p thread_data in warp lane0 is undefined.) + * + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item. + T &exclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. 
+ ScanOp scan_op) ///< [in] Binary scan operator + { + InternalWarpScan internal(temp_storage); + + T inclusive_output; + internal.InclusiveScan(input, inclusive_output, scan_op); + + internal.Update( + input, + inclusive_output, + exclusive_output, + scan_op, + Int2Type()); + } + + + /** + * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute exclusive warp-wide prefix max scans + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. + * The corresponding output \p thread_data in the first warp would be + * INT_MIN, 0, 0, 2, ..., 28, 30, the output for the second warp would be 30, 32, 32, 34, ..., 60, 62, etc. + * + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item. + T &exclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + T initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op) ///< [in] Binary scan operator + { + InternalWarpScan internal(temp_storage); + + T inclusive_output; + internal.InclusiveScan(input, inclusive_output, scan_op); + + internal.Update( + input, + inclusive_output, + exclusive_output, + scan_op, + initial_value, + Int2Type()); + } + + + /** + * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp. Because no initial value is supplied, the \p output computed for warp-lane0 is undefined. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute exclusive warp-wide prefix max scans + * int warp_aggregate; + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, cub::Max(), warp_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. + * The corresponding output \p thread_data in the first warp would be + * ?, 0, 0, 2, ..., 28, 30, the output for the second warp would be ?, 32, 32, 34, ..., 60, 62, etc. 
+ * (The output \p thread_data in warp lane0 is undefined.) Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads + * in the second warp, etc. + * + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item. + T &exclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + ScanOp scan_op, ///< [in] Binary scan operator + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. + { + InternalWarpScan internal(temp_storage); + + T inclusive_output; + internal.InclusiveScan(input, inclusive_output, scan_op); + + internal.Update( + input, + inclusive_output, + exclusive_output, + warp_aggregate, + scan_op, + Int2Type()); + } + + + /** + * \brief Computes an exclusive prefix scan using the specified binary scan functor across the calling warp. Also provides every thread with the warp-wide \p warp_aggregate of all inputs. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute exclusive warp-wide prefix max scans + * int warp_aggregate; + * int warp_id = threadIdx.x / 32; + * WarpScan(temp_storage[warp_id]).ExclusiveScan(thread_data, thread_data, INT_MIN, cub::Max(), warp_aggregate); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. + * The corresponding output \p thread_data in the first warp would be + * INT_MIN, 0, 0, 2, ..., 28, 30, the output for the second warp would be 30, 32, 32, 34, ..., 60, 62, etc. + * Furthermore, \p warp_aggregate would be assigned \p 30 for threads in the first warp, \p 62 for threads + * in the second warp, etc. + * + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void ExclusiveScan( + T input, ///< [in] Calling thread's input item. + T &exclusive_output, ///< [out] Calling thread's output item. May be aliased with \p input. + T initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op, ///< [in] Binary scan operator + T &warp_aggregate) ///< [out] Warp-wide aggregate reduction of input items. + { + InternalWarpScan internal(temp_storage); + + T inclusive_output; + internal.InclusiveScan(input, inclusive_output, scan_op); + + internal.Update( + input, + inclusive_output, + exclusive_output, + warp_aggregate, + scan_op, + initial_value, + Int2Type()); + } + + + //@} end member group + /******************************************************************//** + * \name Combination (inclusive & exclusive) prefix scans + *********************************************************************/ + //@{ + + + /** + * \brief Computes both inclusive and exclusive prefix scans using the specified binary scan functor across the calling warp. 
Because no initial value is supplied, the \p exclusive_output computed for warp-lane0 is undefined. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide exclusive prefix max scans within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute exclusive warp-wide prefix max scans + * int inclusive_partial, exclusive_partial; + * WarpScan(temp_storage[warp_id]).Scan(thread_data, inclusive_partial, exclusive_partial, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. + * The corresponding output \p inclusive_partial in the first warp would be + * 0, 0, 2, 2, ..., 30, 30, the output for the second warp would be 32, 32, 34, 34, ..., 62, 62, etc. + * The corresponding output \p exclusive_partial in the first warp would be + * ?, 0, 0, 2, ..., 28, 30, the output for the second warp would be ?, 32, 32, 34, ..., 60, 62, etc. + * (The output \p thread_data in warp lane0 is undefined.) + * + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void Scan( + T input, ///< [in] Calling thread's input item. + T &inclusive_output, ///< [out] Calling thread's inclusive-scan output item. + T &exclusive_output, ///< [out] Calling thread's exclusive-scan output item. + ScanOp scan_op) ///< [in] Binary scan operator + { + InternalWarpScan internal(temp_storage); + + internal.InclusiveScan(input, inclusive_output, scan_op); + + internal.Update( + input, + inclusive_output, + exclusive_output, + scan_op, + Int2Type()); + } + + + /** + * \brief Computes both inclusive and exclusive prefix scans using the specified binary scan functor across the calling warp. + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates four concurrent warp-wide prefix max scans within a block of + * 128 threads (one per each of the 32-thread warps). + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Compute inclusive warp-wide prefix max scans + * int warp_id = threadIdx.x / 32; + * int inclusive_partial, exclusive_partial; + * WarpScan(temp_storage[warp_id]).Scan(thread_data, inclusive_partial, exclusive_partial, INT_MIN, cub::Max()); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, -1, 2, -3, ..., 126, -127}. + * The corresponding output \p inclusive_partial in the first warp would be + * 0, 0, 2, 2, ..., 30, 30, the output for the second warp would be 32, 32, 34, 34, ..., 62, 62, etc. + * The corresponding output \p exclusive_partial in the first warp would be + * INT_MIN, 0, 0, 2, ..., 28, 30, the output for the second warp would be 30, 32, 32, 34, ..., 60, 62, etc. 
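As with WarpReduce above, the WarpScan snippets in this header seem to have dropped their element-type template arguments in the diff text. A minimal hedged sketch of the documented calls follows; the include path and names are illustrative assumptions.

// Sketch only: mirrors the WarpScan usage documented above.
#include <limits.h>
#include "local_cub/warp/warp_scan.cuh"      // assumed vendored path

__global__ void WarpScanExample (int *d_data)
{
    typedef cub::WarpScan<int> WarpScan ;
    __shared__ typename WarpScan::TempStorage temp_storage [4] ;   // 4 warps per 128-thread block

    int tid = blockIdx.x * blockDim.x + threadIdx.x ;
    int thread_data = d_data [tid] ;
    int warp_id = threadIdx.x / 32 ;

    // exclusive prefix sum: lane 0 receives 0, lane k the sum of lanes 0..k-1
    WarpScan (temp_storage [warp_id]).ExclusiveSum (thread_data, thread_data) ;

    // the combined form returns both partials and seeds the exclusive scan with INT_MIN:
    //   int inclusive_partial, exclusive_partial ;
    //   WarpScan (temp_storage [warp_id]).Scan (thread_data, inclusive_partial,
    //       exclusive_partial, INT_MIN, cub::Max ()) ;

    d_data [tid] = thread_data ;
}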
+ * + * \tparam ScanOp [inferred] Binary scan operator type having member T operator()(const T &a, const T &b) + */ + template + __device__ __forceinline__ void Scan( + T input, ///< [in] Calling thread's input item. + T &inclusive_output, ///< [out] Calling thread's inclusive-scan output item. + T &exclusive_output, ///< [out] Calling thread's exclusive-scan output item. + T initial_value, ///< [in] Initial value to seed the exclusive scan + ScanOp scan_op) ///< [in] Binary scan operator + { + InternalWarpScan internal(temp_storage); + + internal.InclusiveScan(input, inclusive_output, scan_op); + + internal.Update( + input, + inclusive_output, + exclusive_output, + scan_op, + initial_value, + Int2Type()); + } + + + + //@} end member group + /******************************************************************//** + * \name Data exchange + *********************************************************************/ + //@{ + + /** + * \brief Broadcast the value \p input from warp-lanesrc_lane to all lanes in the warp + * + * \par + * - \smemreuse + * + * \par Snippet + * The code snippet below illustrates the warp-wide broadcasts of values from + * lanes0 in each of four warps to all other threads in those warps. + * \par + * \code + * #include + * + * __global__ void ExampleKernel(...) + * { + * // Specialize WarpScan for type int + * typedef cub::WarpScan WarpScan; + * + * // Allocate WarpScan shared memory for 4 warps + * __shared__ typename WarpScan::TempStorage temp_storage[4]; + * + * // Obtain one input item per thread + * int thread_data = ... + * + * // Broadcast from lane0 in each warp to all other threads in the warp + * int warp_id = threadIdx.x / 32; + * thread_data = WarpScan(temp_storage[warp_id]).Broadcast(thread_data, 0); + * + * \endcode + * \par + * Suppose the set of input \p thread_data across the block of threads is {0, 1, 2, 3, ..., 127}. + * The corresponding output \p thread_data will be + * {0, 0, ..., 0} in warp0, + * {32, 32, ..., 32} in warp1, + * {64, 64, ..., 64} in warp2, etc. 
+ */ + __device__ __forceinline__ T Broadcast( + T input, ///< [in] The value to broadcast + unsigned int src_lane) ///< [in] Which warp lane is to do the broadcasting + { + return InternalWarpScan(temp_storage).Broadcast(input, src_lane); + } + + //@} end member group + +}; + +/** @} */ // end group WarpModule + +} // CUB namespace +CUB_NS_POSTFIX // Optional outer namespace(s) diff --git a/GraphBLAS/CUDA/matrix.h b/GraphBLAS/CUDA/matrix.h new file mode 100644 index 0000000000..11930475b7 --- /dev/null +++ b/GraphBLAS/CUDA/matrix.h @@ -0,0 +1,72 @@ +//SPDX-License-Identifier: Apache-2.0 + +#define chunksize 128 + +#define ASSERT +#define GB_RESTRICT __restrict__ +//#define GB_GETA( aval, ax, p) aval = (T_Z)ax[ ( p )] +//#define GB_GETB( bval, bx, p) bval = (T_Z)bx[ ( p )] +#define GB_ADD_F( f , s) f = GB_ADD ( f, s ) +#define GB_C_MULT( c, a, b) c = GB_MULT( (a), (b) ) +#define GB_MULTADD( c, a ,b ) GB_ADD_F( (c), GB_MULT( (a),(b) ) ) +#define GB_DOT_TERMINAL ( c ) +//# if ( c == TERMINAL_VALUE) break; + +#include "GB_imin.h" +#include "GB_zombie.h" +#include "GB_nnz.h" +#include "GB_partition.h" +#include "GB_binary_search.h" +#include "GB_search_for_vector_template.c" + +#undef GB_DOT_MERGE +// cij += A(k,i) * B(k,j), for merge operation +#define GB_DOT_MERGE \ +{ \ + GB_GETA ( aki= (T_Z)Ax[pA]) ; /* aki = A(k,i) */ \ + GB_GETB ( bkj= (T_Z)Bx[pB]) ; /* bkj = B(k,j) */ \ + if (cij_exists) \ + { \ + GB_MULTADD (cij, aki, bkj) ; /* cij += aki * bkj */ \ + } \ + else \ + { \ + /* cij = A(k,i) * B(k,j), and add to the pattern */ \ + cij_exists = true ; \ + GB_C_MULT (cij, aki, bkj) ; /* cij = aki * bkj */ \ + } \ +} + + +typedef void (*GxB_binary_function) (void *, const void *, const void *) ; + +#include "GB_opaque.h" + +typedef enum +{ + // for all GrB_Descriptor fields: + GxB_DEFAULT = 0, // default behavior of the method + + // for GrB_OUTP only: + GrB_REPLACE = 1, // clear the output before assigning new values to it + + // for GrB_MASK only: + GrB_COMP = 2, // use the structural complement of the input + GrB_SCMP = 2, // same as GrB_COMP (deprecated; use GrB_COMP instead) + GrB_STRUCTURE = 4, // use the only pattern of the mask, not its values + + // for GrB_INP0 and GrB_INP1 only: + GrB_TRAN = 3, // use the transpose of the input + + // for GxB_GPU_CONTROL only: + GxB_GPU_ALWAYS = 4, + GxB_GPU_NEVER = 5, + + // for GxB_AxB_METHOD only: + GxB_AxB_GUSTAVSON = 1001, // gather-scatter saxpy method + GxB_AxB_DOT = 1003, // dot product + GxB_AxB_HASH = 1004, // hash-based saxpy method + GxB_AxB_SAXPY = 1005 // saxpy method (any kind) +} +GrB_Desc_Value ; + diff --git a/GraphBLAS/CUDA/stringify.cpp b/GraphBLAS/CUDA/stringify.cpp new file mode 100644 index 0000000000..47ebbe77dd --- /dev/null +++ b/GraphBLAS/CUDA/stringify.cpp @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
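For readers unfamiliar with the jitified macros, the GB_DOT_MERGE block in matrix.h above expands differently for each semiring. A hypothetical expansion for a plus-times semiring over double, assuming GB_GETA/GB_GETB load Ax[pA]/Bx[pB], GB_MULT is '*', and GB_ADD is '+', would behave like:

//  double aki = (double) Ax [pA] ;     // aki = A(k,i)
//  double bkj = (double) Bx [pB] ;     // bkj = B(k,j)
//  if (cij_exists)
//  {
//      cij += aki * bkj ;              // GB_MULTADD: cij = GB_ADD (cij, GB_MULT (aki, bkj))
//  }
//  else
//  {
//      cij_exists = true ;             // C(i,j) enters the pattern
//      cij = aki * bkj ;               // GB_C_MULT: cij = GB_MULT (aki, bkj)
//  }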
+ * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + Stringify is a simple utility to convert text files to C string literals. + */ + +#include +#include +#include +#include + +// Replaces non-alphanumeric characters with '_' and +// prepends '_' if the string begins with a digit. +std::string sanitize_varname(std::string const& s) { + std::string r = s; + if (std::isdigit(r[0])) { + r = '_' + r; + } + for (std::string::iterator it = r.begin(); it != r.end(); ++it) { + if (!std::isalnum(*it)) { + *it = '_'; + } + } + return r; +} +// Replaces " with \" +std::string sanitize_string_literal(std::string const& s) { + std::stringstream ss; + for (std::string::const_iterator it = s.begin(); it != s.end(); ++it) { + if (*it == '"' || *it == '\\') { + ss << '\\'; + } + ss << *it; + } + return ss.str(); +} + +int main(int argc, char* argv[]) { + if (argc <= 1 || argv[1][0] == '-') { + std::cout << "Stringify - Converts text files to C string literals" + << std::endl; + std::cout << "Usage: " << argv[0] << " infile [varname] > outfile" + << std::endl; + return -1; + } + char* filename = argv[1]; + std::string varname = (argc > 2) ? argv[2] : sanitize_varname(filename); + std::ifstream istream(filename); + std::ostream& ostream = std::cout; + std::string line; + // Note: This puts "filename\n" at the beginning of the string, which is + // what jitify expects. + ostream << "const char* const " << varname << " = " + << "\"" << filename << "\\n\"" << std::endl; + while (std::getline(istream, line)) { + ostream << "\"" << sanitize_string_literal(line) << "\\n\"" << std::endl; + } + ostream << ";" << std::endl; + return 0; +} diff --git a/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase1.cu b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase1.cu new file mode 100644 index 0000000000..aa367c7b9a --- /dev/null +++ b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase1.cu @@ -0,0 +1,598 @@ +//------------------------------------------------------------------------------ +// templates/GB_AxB_cuda_dot3_phase1: symbolic load balancing and data partition +// to assign work to different 'buckets' for later compute +//------------------------------------------------------------------------------ + +// This kernel scans the non-zero pattern in A and B, takes into account the +// mask and computes total work required to form C. Then it classifies each +// dot product into a set of buckets for efficient compute. 
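To make the role of the stringify utility above concrete: given a one-line input file, say hello.cu containing "__global__ void hello ( ) { }", an invocation along the lines of "stringify hello.cu hello_cu > hello.cu.jit" (file and variable names hypothetical) emits a C string literal of this shape, with the filename on the first quoted line as jitify expects:

// illustrative stringify output, not a file in this commit:
const char* const hello_cu = "hello.cu\n"
"__global__ void hello ( ) { }\n"
;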
+ +#define GB_KERNEL +#include +#include +#include "matrix.h" +#include "GB_cuda_buckets.h" +#include "local_cub/block/block_scan.cuh" +#include "mySemiRing.h" + +//------------------------------------------------------------------------------ +// GB_bucket_assignment +//------------------------------------------------------------------------------ + +// assign the dot product C(i,j) = A(:,i)'*B(:,j) to a specific bucket +__device__ static inline GB_bucket_code GB_bucket_assignment +( + int64_t ainz, // # of entries A(:,i), always > 0 + int64_t bjnz, // # of entries B(:,j), always > 0 + int64_t vlen // vector length of A(:,i) and B(:,j) +) +{ + + int b = 0 ; // no bucket assigned yet + + // GB_BUCKET (condition,bucket) : assigns an entry to a bucket, + // if the condition holds, but without using any if statements. + // An entry is assigned once and not reassigned. + + // If the bucket b has not assigned, it is b = 0. The GB_BUCKET function + // tests this case, and if the condition is also true, the expression + // (b==0) * condition * (bucket+1) becomes equal to bucket+1. This + // value is added to b, which is zero, so the final result is that b + // is set to bucket+1. + + // If the bucket b has been assigned already, we have b > 0. Thus, + // the expression ((b==0) * condition * (bucket+1)) becomes zero. + // When added to b, the result is that b doesn't change, so the bucket + // assignment b is unmodified. + + #define GB_BUCKET(condition,bucket) \ + b = (((b == 0) * (condition)) * (bucket+1)) + b ; + +// if (ia_last < ib_first || ib_last < ia_first) + { + + //---------------------------------------------------------------------- + // pattern of A(:,i) and B(:,j) do not overlap + //---------------------------------------------------------------------- + + // The patterns of A(:,i) and B(:,j) are always sorted. If the last + // entry in A(:,i) comes before the first entry in B(:,j), or visa + // versa, then there is no work to do since C(i,j) must be a zombie. + + // GB_BUCKET (ia_last < ib_first || ib_last < ia_first, GB_BUCKET_ZOMBIE); + + } +// else if (bjnz == vlen && ainz == vlen && vlen > 256) + { + + //---------------------------------------------------------------------- + // both A(:,i) and B(:,j) are dense + //---------------------------------------------------------------------- + + // No search of A(:,i) or B(:,j) is needed. Total work is O(vlen). + // The intersection is non-empty, so C(i,j) cannot be a zombie. + + // CUDA kernel: templates/GB_jit_AxB_dot3_phase3_dndn.cu.jit + + GB_BUCKET (bjnz == vlen && ainz == vlen && vlen > 256, GB_BUCKET_DNDN) ; + + } +// else if (ainz == vlen) + { + + //---------------------------------------------------------------------- + // A(:,i) is dense and B(:,j) is sparse + //---------------------------------------------------------------------- + + // No search of A(:,i) is needed. Total work is O(bjnz), via a linear + // time scan of B(:,j). Since A(:,i) is dense and B(:,j) is non-empty, + // the intersection is non-empty, so C(i,j) cannot be a zombie. + + // CUDA kernel: templates/GB_jit_AxB_dot3_phase3_spdn.cu.jit + // Two buckets are used, depending on bjnz. 
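+        // Worked example of the branchless GB_BUCKET update (illustrative):
+        // suppose ainz == vlen and bjnz == 100.  On entry b == 0, so the
+        // first GB_BUCKET below computes ((b==0) * condition) * (bucket+1)
+        // = 1 * 1 * (GB_BUCKET_DNVS+1) and sets b = GB_BUCKET_DNVS+1.  The
+        // second GB_BUCKET sees b != 0, so its expression is zero and b is
+        // left unchanged.  The final "return (b-1)" undoes the "+1".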
+ GB_BUCKET (ainz == vlen && bjnz < 256, GB_BUCKET_DNVS) ; + GB_BUCKET (ainz == vlen && bjnz >= 256, GB_BUCKET_DNSP) ; + + } +// else if (bjnz == vlen) + { + + //---------------------------------------------------------------------- + // A(:,i) is sparse and B(:,j) is dense + //---------------------------------------------------------------------- + + // No search of B(:,j) is needed. Total work is O(ainz), via a linear + // time scan of A(:,i). Since B(:,j) is dense and A(:,i) is non-empty, + // the intersection is non-empty, so C(i,j) cannot be a zombie. + + // CUDA kernel: templates/GB_jit_AxB_dot3_phase3_spdn.cu.jit + // Two buckets are used, depending on ainz. + GB_BUCKET (bjnz == vlen && ainz < 256, GB_BUCKET_VSDN) ; + GB_BUCKET (bjnz == vlen && ainz >= 256, GB_BUCKET_SPDN) ; + + } +// else if ((ainz > 32 * bjnz && bjnz < 256) +// || (bjnz > 32 * ainz && ainz < 256)) + { + + //---------------------------------------------------------------------- + // A(:,i) is very sparse compared to B(:,j), or visa versa + //---------------------------------------------------------------------- + + // Since B(:,j) is small, and much smaller than A(:,i), the efficient + // way to compute C(i,j) is a linear scan of B(:,j). For each B(k,j), + // a binary search for the index A(k,i) is done. The expected work to + // compute C(i,j) is thus O(bjnz * log2 (ainz)). If A(:,i) is very + // sparse compared to B(:,j), the opposite is done inside the kernel. + + // CUDA kernel: templates/GB_jit_AxB_dot3_phase3_vssp.cu.jit + + GB_BUCKET ((ainz > 32 * bjnz && bjnz < 256) + || (bjnz > 32 * ainz && ainz < 256), GB_BUCKET_VSSP) ; + + } +// else if (ainz + bjnz <= 4) + { + + //---------------------------------------------------------------------- + // both A(:,i) and B(:,j) are very tiny (total size 4 or less) + //---------------------------------------------------------------------- + + // CUDA kernel: templates/GB_jit_AxB_dot3_phase3_vsvs.cu.jit + //GB_BUCKET (ainz + bjnz <= 4, GB_BUCKET_VSVS_4) ; + + } +// else if (ainz + bjnz <= 16) + { + + //---------------------------------------------------------------------- + // both A(:,i) and B(:,j) are tiny (total size 16 or less) + //---------------------------------------------------------------------- + + // CUDA kernel: templates/GB_jit_AxB_dot3_phase3_vsvs.cu.jit + //GB_BUCKET (ainz + bjnz <= 16, GB_BUCKET_VSVS_16) ; + + } +// else if (ainz + bjnz <= 64) + { + + //---------------------------------------------------------------------- + // both A(:,i) and B(:,j) are small (total size 64 or less) + //---------------------------------------------------------------------- + + // CUDA kernel: templates/GB_jit_AxB_dot3_phase3_vsvs.cu.jit + //GB_BUCKET (ainz + bjnz <= 64, GB_BUCKET_VSVS_64) ; + + } +// else if (ainz + bjnz <= 256) + { + + //---------------------------------------------------------------------- + // both A(:,i) and B(:,j) are modest in size (total size 256 or less) + //---------------------------------------------------------------------- + + // CUDA kernel: templates/GB_jit_AxB_dot3_phase3_vsvs.cu.jit + GB_BUCKET (ainz + bjnz <= 256, GB_BUCKET_VSVS_256) ; + + } +// else + { + + //---------------------------------------------------------------------- + // default: use the merge-path method + //---------------------------------------------------------------------- + + // CUDA kernel: templates/GB_jit_AxB_dot3_phase3_mp.cu.jit + GB_BUCKET (true, GB_BUCKET_MERGEPATH) ; + } + + // subtract one to undo the "bucket+1" assignment in the + // GB_BUCKET macro 
assignment expression. + return (GB_bucket_code) (b-1) ; +} + + +//-------------------------------------------------------------------------- +// GB_AxB_cuda_dot3_phase1: build nanobuckets, hunt for pre-zombies +//-------------------------------------------------------------------------- + +// GB_AxB_cuda_dot3_phase1 is a CUDA kernel that scans all entries in C and +// assigns them to each of the 12 buckets. The output is a 12-by-blockDim array of +// bucket counts, per threadblock (the nanobucket array). Each of the blockDim.x +// threads has its own set of 12 bucket counts. Each threadblock in this +// kernel then computes the first part of the cumulative sum of the +// nanobuckets, and writes it to global memory. + +// The kernel also computes Ci, of size nnz(C), which contains the +// zombie assignment or bucket assignment for non-zombies in C. + +template +__global__ void GB_AxB_cuda_dot3_phase1 +( + // outputs, preallocated in global memory: + int64_t *nanobuckets, // array of size 12-blockDim.x-by-gridDim.x + int64_t *blockbucket, // bucket counts, of size 12-by-gridDim.x + // input/output: + GrB_Matrix C, // final output matrix + // inputs, not modified: + const GrB_Matrix M, // mask matrix + const GrB_Matrix A, // input matrix + const GrB_Matrix B // input matrix +) +{ + + //-------------------------------------------------------------------------- + // get C, M, A, and B + //-------------------------------------------------------------------------- + + const int64_t *__restrict__ Mh = M->h ; + const int64_t *__restrict__ Mp = M->p ; + const int64_t *__restrict__ Mi = M->i ; + const Type_M *__restrict__ Mx = (Type_M*)M->x ; // not accessed if M is structural + const int64_t mnvec = M->nvec ; + const int64_t mnz = GB_NNZ(M) ; + const bool M_is_hyper = M->is_hyper ; + + const int64_t *__restrict__ Ah = A->h ; + const int64_t *__restrict__ Ap = A->p ; + const int64_t *__restrict__ Ai = A->i ; + const int64_t avlen = A->vlen ; + const int64_t anz = GB_NNZ(A) ; + const bool A_is_hyper = A->is_hyper ; + + const int64_t *__restrict__ Bh = B->h ; + const int64_t *__restrict__ Bp = B->p ; + const int64_t *__restrict__ Bi = B->i ; + const int64_t bvlen = B->vlen ; + const int64_t bnz = GB_NNZ(B); + const bool B_is_hyper = B->is_hyper ; + + // int64_t *restrict Cp = C->p ; // copy of Mp + // int64_t *restrict Ch = C->h ; // copy of Mh + int64_t *__restrict__ Ci = C->i ; // for zombies, or bucket assignment + + // Ci [p] for an entry C(i,j) contains either GB_FLIP(i) if C(i,j) is a + // zombie, or (k << 4) + bucket otherwise, where C(:,j) is the kth vector + // of C (j = Ch [k] if hypersparse or j = k if standard sparse), and + // where bucket is the bucket assignment for C(i,j). + // bucket can be recovered from Ci by bucket = Ci & 0xF + + //-------------------------------------------------------------------------- + // clear the bucket counters + //-------------------------------------------------------------------------- + + //ASSERT (mnz > 0) ; + //ASSERT (gridDim.x <= mnz) ; + + // each thread uses 12 bucket counters, held in register + int64_t my_bucket_0 = 0 ; + int64_t my_bucket_1 = 0 ; + int64_t my_bucket_2 = 0 ; + int64_t my_bucket_3 = 0 ; + int64_t my_bucket_4 = 0 ; + int64_t my_bucket_5 = 0 ; + int64_t my_bucket_6 = 0 ; + int64_t my_bucket_7 = 0 ; + int64_t my_bucket_8 = 0 ; + int64_t my_bucket_9 = 0 ; + int64_t my_bucket_10 = 0 ; + int64_t my_bucket_11 = 0 ; + + // Registers cannot be indexed (!) so this macro is used instead. 
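+    // (A local array "int64_t my_bucket [12]" indexed by a runtime value
+    // would typically be spilled to local memory by the compiler, so the
+    // counts are kept in 12 named scalars and selected with a switch.)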
+ // The bucket registers are indexed by the GB_bucket_code enum. + #define GB_BUCKET_COUNT(bucket) \ + { \ + switch (bucket) \ + { \ + case 0: my_bucket_0++ ; break ; \ + case 1: my_bucket_1++ ; break ; \ + case 2: my_bucket_2++ ; break ; \ + case 3: my_bucket_3++ ; break ; \ + case 4: my_bucket_4++ ; break ; \ + case 5: my_bucket_5++ ; break ; \ + case 6: my_bucket_6++ ; break ; \ + case 7: my_bucket_7++ ; break ; \ + case 8: my_bucket_8++ ; break ; \ + case 9: my_bucket_9++ ; break ; \ + case 10: my_bucket_10++ ; break ; \ + case 11: my_bucket_11++ ; break ; \ + } \ + } + /* + if(threadIdx.x==0 ) { + printf(" in phase1 kernel, mnz,anz,bnz= %ld,%ld,%ld\n",mnz,anz,bnz); + } + __syncthreads(); + */ + #define pointerchunk 256 + + __shared__ int64_t Mps[pointerchunk]; + __shared__ int64_t ks [chunksize]; + + //-------------------------------------------------------------------------- + // compute the task descriptor + //-------------------------------------------------------------------------- + + // all threads in this block will compute the same values for these: + int32_t pfirst, plast, kfirst, klast ; + /* + for ( int tid_global = threadIdx.x + blockIdx.x * blockDim.x ; + tid_global < (mnvec+ 7)/8 ; + tid_global += blockDim.x*gridDim.x) + */ + int chunk_max= (mnz + chunksize -1)/chunksize; + for ( int chunk = blockIdx.x; + chunk < chunk_max; + chunk += gridDim.x ) + { + + // The slice for each task contains entries pfirst:plast-1 of M and C. + //GB_PARTITION (pfirst, plast, mnz, chunk, (mnz+1023)/1024 ) ; + pfirst = chunksize * chunk ; + plast = GB_IMIN( chunksize * (chunk+1), mnz ) ; + + int chunk_end; + if ( mnz > chunksize) chunk_end = GB_IMIN( chunksize, + mnz - chunksize*(chunk) ) ; + else chunk_end = mnz; + + // find the first vector of the slice for task tid_global: the + // vector that owns the entry Ai [pfirst] and Ax [pfirst]. + kfirst = GB_search_for_vector_device (pfirst, Mp, 0, mnvec) -1 ; + //if( pfirst ==0) kfirst = 0; + + // find the last vector of the slice for task blockIdx.x: the + // vector that owns the entry Ai [plast-1] and Ax [plast-1]. + klast = GB_search_for_vector_device (plast-1, Mp, kfirst, mnvec) ; + + int k_end = GB_IMIN( pointerchunk , klast - kfirst +2 ) ; + /* + if( threadIdx.x ==0) + { + printf("chunk%d pfirst,plast,ch_end =%d,%d,%d kfirst,klast,kend = %d,%d,%d\n", + chunk, pfirst, plast, chunk_end, kfirst, klast, k_end ) ; + } + __syncthreads(); + */ + + + // load pointer values for this chunk + for ( int i = threadIdx.x; i< k_end; i+= blockDim.x) + { + Mps[i] = Mp[i + kfirst]; + } + __syncthreads(); + + // search for k values for each entry + float slope = (float)(mnvec)/(float)(mnz* chunksize) ; + for ( int i = threadIdx.x; i< chunk_end; i+= blockDim.x) + { + ks[i] = kfirst + slope*( float )(i); + while ( Mps[ ks[i] - kfirst + 1 ] <= (i+pfirst) ) + ks[i]++; + while ( Mps[ ks[i] - kfirst ] > (i+pfirst) ) + ks[i]--; + } + __syncthreads(); + + + //ASSERT (0 <= kfirst && kfirst <= klast && klast < mnvec) ; + /* + if (threadIdx.x ==0 ) { + printf ("threadblock %d after ksearch pfirst %ld plast %ld kfirst %ld klast %ld\n", + blockIdx.x, pfirst, plast, kfirst, klast) ; + } + __syncthreads(); + */ + + + + //-------------------------------------------------------------------------- + // assign entries in C(i,j) to the buckets + //-------------------------------------------------------------------------- + + // if B is hypersparse, bpleft ... 
TODO describe + // int64_t bpleft = 0 ; + + //---------------------------------------------------------------------- + // no binary search variant + //---------------------------------------------------------------------- + + //printf ("no binary search\n") ; + + //int32_t pM_start, pM_end ; + //for (int64_t pM = pfirst + threadIdx.x ; pM < plast ; pM += blockDim.x) + int32_t i,j; + int32_t k = kfirst ; + + //for (int64_t pM = pfirst; pM < plast; pM++ ) + for ( int pM = pfirst + threadIdx.x; + pM < pfirst + chunk_end; + pM += blockDim.x ) + { + GB_bucket_code bucket = GB_BUCKET_ZOMBIE ; + k = ks[ pM - pfirst ] ; + //k += ( pM == Mp[k+1] ) ; + //printf ("tid%d k %ld pM %ld\n", tid_global, k, pM; + i = Mi [ pM ] ; + + if ( MX ( pM ) ) + { + + // do a binary search for k (and j) that has this entry M(i,j) + //k = GB_search_for_vector_device (pM, Mp, k, klast) ; + +// HACK +j = k ; +// int64_t j = (Mh == NULL) ? k : Mh [k] ; + + //-------------------------------------------------------------- + // get B(:,j) + //-------------------------------------------------------------- + + int64_t pB, pB_end ; +// HACK: for sparse only, not hypersparse +pB = Bp [j] ; +pB_end = Bp [j+1] ; +// GB_lookup_device (B_is_hyper, Bh, Bp, &bpleft, bnvec-1, j, +// &pB, &pB_end) ; + int64_t bjnz = pB_end - pB ; + if (bjnz > 0) + { + // int64_t ib_first = Bi [pB] ; + // int64_t ib_last = Bi [pB_end-1] ; + + //---------------------------------------------------------- + // get A(:,i) + //---------------------------------------------------------- + + int64_t pA, pA_end ; + //int64_t apleft = 0 ; +// HACK: for sparse only, not hypersparse +pA = Ap [i] ; +pA_end = Ap [i+1] ; +// GB_lookup_device (A_is_hyper, Ah, Ap, &apleft, anvec-1, i, +// &pA, &pA_end) ; + int64_t ainz = pA_end - pA ; + if (ainz > 0) + { + // int64_t ia_first = Ai [pA] ; + // int64_t ia_last = Ai [pA_end-1] ; + + //------------------------------------------------------ + // determine the bucket for C(i,j) + //------------------------------------------------------ + + //bucket = GB_BUCKET_MERGEPATH ; + bucket= GB_bucket_assignment ( ainz, bjnz, bvlen) ; + } + } + } + + if (bucket == GB_BUCKET_ZOMBIE) + { + // mark C(i,j) is a zombie + //printf ("tid%d pM=%d %d,%d prezombie\n",threadIdx.x,pM,i,j) ; + Ci [pM] = GB_FLIP (i) << 4 ; + // GB_BUCKET_COUNT (GB_BUCKET_ZOMBIE) ; + my_bucket_0++ ; //0 is the zombie bucket + } + else + { + // place C(i,j) in its bucket + Ci [pM] = (k << 4) + bucket ; + GB_BUCKET_COUNT (bucket) ; + //printf ("tid%d pM=%d %d,%d b=%d\n",threadIdx.x, pM, i,j, (int)bucket) ; + } + } + + + + } + __syncthreads(); + + //-------------------------------------------------------------------------- + // cumulative sum of each bucket + //-------------------------------------------------------------------------- + + typedef cub::BlockScan BlockCumSum; + __shared__ typename BlockCumSum::TempStorage temp_storage; + + // The taskbucket for this thread block is an array of size + // 12-by-blockDim.x, held by row. Each thread owns one column of this + // taskbucket, the nanobucket. The nanobucket is a column of length 12, + // with stride equal to blockDim.x. 
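+    // Equivalently: the count for bucket b accumulated by thread t of
+    // threadblock blk is stored at
+    //      nanobuckets [blk * (12 * blockDim.x) + b * blockDim.x + t]
+    // and, after the exclusive scan below, that slot holds the total of the
+    // counts of threads 0..t-1 of this threadblock for bucket b.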
+ int64_t *nanobucket = + nanobuckets + blockIdx.x * (12 * blockDim.x) + threadIdx.x ; + + #define CUMSUM_AND_STORE_NANOBUCKET(bucket) \ + if( threadIdx.x == blockDim.x-1) \ + blockbucket [blockIdx.x + bucket * gridDim.x] = \ + my_bucket_ ## bucket ; \ + BlockCumSum(temp_storage).ExclusiveSum \ + ( my_bucket_ ## bucket, my_bucket_ ## bucket) ; \ + __syncthreads(); \ + nanobucket [bucket * blockDim.x] = my_bucket_ ## bucket ; + + CUMSUM_AND_STORE_NANOBUCKET (0) ; + CUMSUM_AND_STORE_NANOBUCKET (1) ; + CUMSUM_AND_STORE_NANOBUCKET (2) ; + CUMSUM_AND_STORE_NANOBUCKET (3) ; + CUMSUM_AND_STORE_NANOBUCKET (4) ; + CUMSUM_AND_STORE_NANOBUCKET (5) ; + CUMSUM_AND_STORE_NANOBUCKET (6) ; + CUMSUM_AND_STORE_NANOBUCKET (7) ; + CUMSUM_AND_STORE_NANOBUCKET (8) ; + CUMSUM_AND_STORE_NANOBUCKET (9) ; + CUMSUM_AND_STORE_NANOBUCKET (10) ; + CUMSUM_AND_STORE_NANOBUCKET (11) ; + + /* + if(threadIdx.x +blockIdx.x*blockDim.x <= mnvec) //blockDim.x -1){ + { + printf("thd %d blk%d nbucket0 has %ld prev\n",threadIdx.x, blockIdx.x, nanobucket[0]); + printf("thd %d blk%d nbucket1 has %ld prev\n",threadIdx.x, blockIdx.x, nanobucket[1*blockDim.x]); + printf("thd %d blk%d nbucket2 has %ld prev\n",threadIdx.x, blockIdx.x, nanobucket[2*blockDim.x]); + printf("thd %d blk%d nbucket3 has %ld prev\n",threadIdx.x, blockIdx.x, nanobucket[3*blockDim.x]); + printf("thd %d blk%d nbucket4 has %ld prev\n",threadIdx.x, blockIdx.x, nanobucket[4*blockDim.x]); + printf("thd %d blk%d nbucket5 has %ld prev\n",threadIdx.x, blockIdx.x, nanobucket[5*blockDim.x]); + printf("thd %d blk%d nbucket6 has %ld prev\n",threadIdx.x, blockIdx.x, nanobucket[6*blockDim.x]); + printf("thd %d blk%d nbucket7 has %ld prev\n",threadIdx.x, blockIdx.x, nanobucket[7*blockDim.x]); + printf("thd %d blk%d nbucket8 has %ld prev\n",threadIdx.x, blockIdx.x, nanobucket[8*blockDim.x]); + printf("thd %d blk%d nbucket9 has %ld prev\n",threadIdx.x, blockIdx.x, nanobucket[9*blockDim.x]); + printf("thd %d blk%d nbucket10 has %ld prev\n",threadIdx.x, blockIdx.x, nanobucket[10*blockDim.x]); + printf("thd %d blk%d nbucket11 has %ld prev\n",threadIdx.x, blockIdx.x, nanobucket[11*blockDim.x]); + + } + __syncthreads(); + */ + + + // The last thread now has the sum of all nanobuckets, which is then saved + // to the global bucket counts. blockbucket is an array of size + // 12-by-gridDim.x, held by row, with one column per thread block. + // The last thread saves its result in the column of this thread block. + // Note that this write to global memory is not coalesced. 
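+
+    // (The writes are strided because entry (b,blk) of blockbucket lives at
+    // blockbucket [b * gridDim.x + blk], so the 12 stores made below by the
+    // last thread of this block are gridDim.x entries apart.)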
+ + #define STORE_GLOBAL_BUCKET_COUNT(bucket) \ + blockbucket [blockIdx.x + bucket * gridDim.x] += \ + my_bucket_ ## bucket ; + + if (threadIdx.x == blockDim.x - 1 ) + { + STORE_GLOBAL_BUCKET_COUNT (0) ; + STORE_GLOBAL_BUCKET_COUNT (1) ; + STORE_GLOBAL_BUCKET_COUNT (2) ; + STORE_GLOBAL_BUCKET_COUNT (3) ; + STORE_GLOBAL_BUCKET_COUNT (4) ; + STORE_GLOBAL_BUCKET_COUNT (5) ; + STORE_GLOBAL_BUCKET_COUNT (6) ; + STORE_GLOBAL_BUCKET_COUNT (7) ; + STORE_GLOBAL_BUCKET_COUNT (8) ; + STORE_GLOBAL_BUCKET_COUNT (9) ; + STORE_GLOBAL_BUCKET_COUNT (10) ; + STORE_GLOBAL_BUCKET_COUNT (11) ; + } + + /* + if(threadIdx.x == blockDim.x -1){ + + printf("block%d bbucket0 has %ld entries\n",blockIdx.x, blockbucket[0*gridDim.x+blockIdx.x]); + printf("block%d bbucket1 has %ld entries\n",blockIdx.x, blockbucket[1*gridDim.x+blockIdx.x]); + printf("block%d bbucket2 has %ld entries\n",blockIdx.x, blockbucket[2*gridDim.x+blockIdx.x]); + printf("block%d bbucket3 has %ld entries\n",blockIdx.x, blockbucket[3*gridDim.x+blockIdx.x]); + printf("block%d bbucket4 has %ld entries\n",blockIdx.x, blockbucket[4*gridDim.x+blockIdx.x]); + printf("block%d bbucket5 has %ld entries\n",blockIdx.x, blockbucket[5*gridDim.x+blockIdx.x]); + printf("block%d bbucket6 has %ld entries\n",blockIdx.x, blockbucket[6*gridDim.x+blockIdx.x]); + printf("block%d bbucket7 has %ld entries\n",blockIdx.x, blockbucket[7*gridDim.x+blockIdx.x]); + printf("block%d bbucket8 has %ld entries\n",blockIdx.x, blockbucket[8*gridDim.x+blockIdx.x]); + printf("block%d bbucket9 has %ld entries\n",blockIdx.x, blockbucket[9*gridDim.x+blockIdx.x]); + printf("block%d bbucket10 has %ld entries\n",blockIdx.x, blockbucket[10*gridDim.x+blockIdx.x]); + printf("block%d bbucket11 has %ld entries\n",blockIdx.x, blockbucket[11*gridDim.x+blockIdx.x]); + + } + __syncthreads(); + */ + +} + diff --git a/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase2.cu b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase2.cu new file mode 100644 index 0000000000..b4447c60ff --- /dev/null +++ b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase2.cu @@ -0,0 +1,436 @@ +//------------------------------------------------------------------------------ +// templates/GB_AxB_cuda_dot3_phase2: fill the global buckets +//------------------------------------------------------------------------------ + +// TODO describe me + +#define GB_KERNEL +#include +#include "GB_cuda_buckets.h" +#include "matrix.h" +#include +#include "local_cub/block/block_scan.cuh" + +using namespace cooperative_groups; + +// A stateful callback functor that maintains a running prefix to be applied +// during consecutive scan operations. +struct BlockPrefixCallbackOp +{ + // Running prefix + int64_t running_total; + // Constructor + __device__ BlockPrefixCallbackOp(int64_t running_total) : running_total(running_total) {} + + // Callback operator to be entered by the first warp of threads in the block. + // Thread-0 is responsible for returning a value for seeding the block-wide scan. 
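+    // (cub::BlockScan invokes this functor once for each tile it scans: the
+    // functor receives the aggregate of the current tile, and the value it
+    // returns is used as the exclusive prefix for that tile, so
+    // running_total accumulates the totals of all tiles processed so far.)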
+ __device__ int64_t operator()(int64_t block_aggregate) + { + int64_t old_prefix = running_total; + running_total += block_aggregate; + return old_prefix; + } +}; + +__inline__ +__device__ void blockBucketExclusiveSum(int bucketId, int64_t *d_data, int nblocks) +{ + #define blocksize 32 + + // Specialize BlockScan for a 1D block of 32 threads + typedef cub::BlockScan BlockScan; + + // Allocate shared memory for BlockScan + __shared__ typename BlockScan::TempStorage temp_storage; + + // Initialize running total + BlockPrefixCallbackOp prefix_op(0); + + // Have the block iterate over segments of items + int64_t data=0; + + int64_t *blockbucket= d_data; + + for (int block_id = 0; block_id < nblocks; block_id += blocksize) + { + // Load a segment of consecutive items that are blocked across threads + + //printf("block %d entering sum\n",blockIdx.x); + int loc = block_id + threadIdx.x; + if ( loc < nblocks) + { + //printf("block %di loading tid=%d\n",block_id,tid); + data = blockbucket[bucketId*nblocks +loc ] ; + } + __syncthreads(); + + //printf("bb%d_%d s0 before prefix= %ld \n", block_id,bucketId, + // blockbucket[bucketId*nblocks + block_id+threadIdx.x] ) ; + // Collectively compute the block-wide exclusive prefix sum + BlockScan(temp_storage).ExclusiveSum( data, data, prefix_op); + __syncthreads(); + + if ( loc < nblocks) + { + blockbucket[bucketId*nblocks +loc ] = data ; + } + __syncthreads(); + + //printf("bb%d_%d = %ld \n", block_id, bucketId, blockbucket[bucketId*nblocks+block_id+threadIdx.x] ) ; + + data = 0; + } +} + + +template< typename T, int tile_sz> +__inline__ __device__ +T warp_ReduceSumPlus( thread_block_tile tile, T val) +{ + // Each iteration halves the number of active threads + // Each thread adds its partial sum[i] to sum[lane+i] + for (int i = tile.size() / 2; i > 0; i /= 2) { + val += tile.shfl_down( val, i); + } + return val; // note: only thread 0 will return full sum +} + +template +__inline__ __device__ +T block_ReduceSum(thread_block g, T val) +{ + static __shared__ T shared[warpSize]; // Shared mem for 32 partial sums + int lane = threadIdx.x % warpSize; + int wid = threadIdx.x / warpSize; + thread_block_tile tile = tiled_partition( g ); + + // Each warp performs partial reduction + val = warp_ReduceSumPlus( tile, val); + + // Wait for all partial reductions + if (lane==0) { + //printf("thd%d warp%d sum is %d\n", threadIdx.x, wid, val); + shared[wid]=val; // Write reduced value to shared memory + //printf("thd%d stored warp %d sum %d\n", threadIdx.x, wid, val); + } + __syncthreads(); // Wait for all partial reductions + + if (wid > 0 ) return val ; + //Final reduce within first warp + if (wid==0) val = warp_ReduceSumPlus( tile, val) ; + + return val; +} + +// GB_AxB_cuda_dot3_phase2 is a CUDA kernel that takes as input the +// nanobuckets and blockbucket arrays computed by the first phase kernel, +// GB_AxB_cuda_dot3_phase1. The launch geometry of this kernel must match the +// GB_AxB_cuda_dot3_phase1 kernel, with the same # of threads and threadblocks. 
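+
+// Illustrative sketch of that requirement (not the actual launch code;
+// "nthreads" and "nblocks" are placeholder names, and the phase1 kernel
+// would also need its template arguments):
+//
+//      GB_AxB_cuda_dot3_phase1 <<<nblocks, nthreads>>>
+//          (nanobuckets, blockbucket, C, M, A, B) ;
+//      GB_AxB_dot3_phase2 <<<nblocks, nthreads>>>
+//          (nanobuckets, blockbucket, bucketp, bucket, offset, C, cnz, nblocks) ;
+//
+// Both launches must use the same nblocks and nthreads, since phase2 and
+// phase2end index the nanobuckets array with the same gridDim.x and
+// blockDim.x that phase1 used when writing it.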
+ +__global__ +void GB_AxB_dot3_phase2 +( + // input, not modified: + int64_t *__restrict__ nanobuckets, // array of size 12-blockDim.x-by-nblocks + int64_t *__restrict__ blockbucket, // global bucket count, of size 12*nblocks + // output: + int64_t *__restrict__ bucketp, // global bucket cumsum, of size 13 + int64_t *__restrict__ bucket, // global buckets, of size cnz (== mnz) + int64_t *__restrict__ offset, // global offsets, for each bucket + // inputs, not modified: + GrB_Matrix C, // output matrix + const int64_t cnz, // number of entries in C and M + const int nblocks // input number of blocks to reduce +) +{ + + //-------------------------------------------------------------------------- + // get C and M + //-------------------------------------------------------------------------- + + //int64_t *Ci = C->i ; // for zombies, or bucket assignment + + // Ci [p] for an entry C(i,j) contains either GB_FLIP(i) if C(i,j) is a + // zombie, or (k << 4) + bucket otherwise, where C(:,j) is the kth vector + // of C (j = Ch [k] if hypersparse or j = k if standard sparse), and + // where bucket is the bucket assignment for C(i,j). This phase does not + // need k, just the bucket for each entry C(i,j). + + //-------------------------------------------------------------------------- + // sum up the bucket counts of prior threadblocks + //-------------------------------------------------------------------------- + + // blockbucket is an array of size 12-by-nblocks, held by row. The + // entry blockbucket [bucket * nblocks + t] holds the # of entries + // in the bucket (in range 0 to 11) found by threadblock t. + + + //__shared__ uint64_t offset [12] ; + uint64_t s_0=0; + uint64_t s_1=0; + uint64_t s_2=0; + uint64_t s_3=0; + uint64_t s_4=0; + uint64_t s_5=0; + uint64_t s_6=0; + uint64_t s_7=0; + uint64_t s_8=0; + uint64_t s_9=0; + uint64_t s_10=0; + uint64_t s_11=0; + + thread_block_tile<32> tile = tiled_partition<32>(this_thread_block() ); + + //printf("block %d entering sum\n",blockIdx.x); + int tid = threadIdx.x + blockIdx.x*blockDim.x; + #define reduceBucket( B ) \ + for( tid = threadIdx.x + blockIdx.x*blockDim.x; \ + tid < nblocks; \ + tid += blockDim.x*gridDim.x) \ + { \ + s_ ## B += blockbucket[ B *nblocks +tid] ; \ + } \ + __syncthreads(); \ + s_ ## B = warp_ReduceSumPlus( tile, s_ ## B); + + reduceBucket( 0 ) + reduceBucket( 1 ) + reduceBucket( 2 ) + reduceBucket( 3 ) + reduceBucket( 4 ) + reduceBucket( 5 ) + reduceBucket( 6 ) + reduceBucket( 7 ) + reduceBucket( 8 ) + reduceBucket( 9 ) + reduceBucket( 10 ) + reduceBucket( 11 ) + + + //printf("summing blk,tid=%d,%d\n",blockIdx.x,threadIdx.x); + if (threadIdx.x ==0 ) + { + atomicAdd( (unsigned long long int*)&(offset[0]), s_0); + atomicAdd( (unsigned long long int*)&(offset[1]), s_1); + atomicAdd( (unsigned long long int*)&(offset[2]), s_2); + atomicAdd( (unsigned long long int*)&(offset[3]), s_3); + atomicAdd( (unsigned long long int*)&(offset[4]), s_4); + atomicAdd( (unsigned long long int*)&(offset[5]), s_5); + atomicAdd( (unsigned long long int*)&(offset[6]), s_6); + atomicAdd( (unsigned long long int*)&(offset[7]), s_7); + atomicAdd( (unsigned long long int*)&(offset[8]), s_8); + atomicAdd( (unsigned long long int*)&(offset[9]), s_9); + atomicAdd( (unsigned long long int*)&(offset[10]),s_10); + atomicAdd( (unsigned long long int*)&(offset[11]),s_11); + } + __syncthreads(); + + + + if( gridDim.x >= 12) + { + // Cumulative sum across blocks for each bucket + if (blockIdx.x <12) + blockBucketExclusiveSum( blockIdx.x, blockbucket, nblocks ) ; 
+ } + else + { + if (blockIdx.x == 0) + { + blockBucketExclusiveSum( 0, blockbucket, nblocks ) ; + blockBucketExclusiveSum( 1, blockbucket, nblocks ) ; + blockBucketExclusiveSum( 2, blockbucket, nblocks ) ; + blockBucketExclusiveSum( 3, blockbucket, nblocks ) ; + blockBucketExclusiveSum( 4, blockbucket, nblocks ) ; + blockBucketExclusiveSum( 5, blockbucket, nblocks ) ; + blockBucketExclusiveSum( 6, blockbucket, nblocks ) ; + blockBucketExclusiveSum( 7, blockbucket, nblocks ) ; + blockBucketExclusiveSum( 8, blockbucket, nblocks ) ; + blockBucketExclusiveSum( 9, blockbucket, nblocks ) ; + blockBucketExclusiveSum( 10, blockbucket, nblocks) ; + blockBucketExclusiveSum( 11, blockbucket, nblocks) ; + } + } + + + + + //-------------------------------------------------------------------------- + // last threadblock saves the cumsum of the 12 global buckets + //-------------------------------------------------------------------------- + /* do on cpu + if (blockIdx.x == 0) // gridDim.x - 1) + { + + // the last threadblock: compute all 12 global bucket sizes, and its + // cumulative sum + if (threadIdx.x == 0) + { + // the work in this last threadblock is single-threaded + uint64_t s = 0; + for (int bucket = 0 ; bucket < 12 ; bucket++) + { + // write the global cumsum of all buckets to the final global + // bucketp. bucketp [bucket] is the starting position in + // the bucket. + bucketp [bucket] = s ; + + // bucket_size is the total # of entries in this bucket, for + // all threadblocks. It has nearly been computed already, + // since offset [bucket] = sum (blockbucket (bucket,0:blockDim.x-1)). + // All that is left is to add the counts for the last threadblock.` + //int64_t global_bucket_size = offset [bucket]; + // + blockbucket [bucket * gridDim.x + blockIdx.x] ; + + //printf("bucketp[%d]= %ld\n",bucket, s); + // s is a cumulative sum of the global bucket sizes + s += offset[bucket]; // global_bucket_size ; + } + // The kth global bucket (for k = 0 to 11) appears in: + // bucket [bucketp [k]... bucketp [k+1]-1], + // so the end of the last bucket needs bucketp [12]. + bucketp [12] = (int64_t)s; + //printf("bucketp[12]= %ld\n", s); + // all entries in C now appear in the buckets. + // ASSERT (s == cnz) ; + } + __syncthreads ( ) ; + } + */ + +} // phase2 + + +__global__ +void GB_AxB_dot3_phase2end +( + // input, not modified: + int64_t *__restrict__ nanobuckets, // array of size 12-blockDim.x-by-nblocks + const int64_t *__restrict__ blockbucket, // global bucket count, of size 12*nblocks + // output: + const int64_t *__restrict__ bucketp, // global bucket cumsum, of size 13 + int64_t *__restrict__ bucket, // global buckets, of size cnz (== mnz) + const int64_t *__restrict__ offset, // global offsets, for each bucket + // inputs, not modified: + const GrB_Matrix C, // output matrix + const int64_t cnz // number of entries in C and M +) +{ + + + int64_t *__restrict__ Ci = C->i ; // for zombies, or bucket assignment + int64_t *__restrict__ Mp = C->p ; // for offset calculations + int64_t mnvec = C->nvec; + + //-------------------------------------------------------------------------- + // load and shift the nanobuckets for this thread block + //-------------------------------------------------------------------------- + + // The taskbucket for this threadblock is an array of size + // 12-by-blockDim.x, held by row. It forms a 2D array within the 3D + // nanobuckets array. 
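+    // (After the LOAD_NANOBUCKET macro below, each my_bucket_<b> register
+    // holds the global position at which this thread starts writing its
+    // entries of bucket b, namely:
+    //      bucketp [b]                                (start of bucket b)
+    //    + blockbucket [b * gridDim.x + blockIdx.x]   (entries from earlier
+    //                                                  threadblocks)
+    //    + nanobucket [b * blockDim.x]                 (entries from earlier
+    //                                                  threads of this block).)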
+ int64_t *__restrict__ taskbucket = nanobuckets + blockIdx.x * (12 * blockDim.x) ; + + //printf("block%d thd%d blockbucket= %ld\n", blockIdx.x, threadIdx.x, + // blockbucket[blockIdx.x*gridDim.x+blockIdx.x]); + + // Each thread in this threadblock owns one column of this taskbucket, for + // its set of 12 nanobuckets. The nanobuckets are a column of length 12, + // with stride equal to blockDim.x. + int64_t *__restrict__ nanobucket = taskbucket + threadIdx.x; + + // Each thread loads its 12 nanobucket values into registers. + #define LOAD_NANOBUCKET(bucket) \ + int64_t my_bucket_ ## bucket = \ + nanobucket [bucket * blockDim.x] \ + + blockbucket [bucket * gridDim.x + blockIdx.x]\ + + bucketp [bucket] ; + + LOAD_NANOBUCKET (0) ; + LOAD_NANOBUCKET (1) ; + LOAD_NANOBUCKET (2) ; + LOAD_NANOBUCKET (3) ; + LOAD_NANOBUCKET (4) ; + LOAD_NANOBUCKET (5) ; + LOAD_NANOBUCKET (6) ; + LOAD_NANOBUCKET (7) ; + LOAD_NANOBUCKET (8) ; + LOAD_NANOBUCKET (9) ; + LOAD_NANOBUCKET (10) ; + LOAD_NANOBUCKET (11) ; + + // Now each thread has an index into the global set of 12 buckets, + // held in bucket, of where to place its own entries. + + //-------------------------------------------------------------------------- + // construct the global buckets + //-------------------------------------------------------------------------- + + // The slice for task blockIdx.x contains entries pfirst:plast-1 of M and + // C, which is the part of C operated on by this threadblock. + int64_t pfirst, plast ; + + /* + for ( int tid_global = threadIdx.x + blockIdx.x * blockDim.x ; + tid_global < (mnvec+7)/8 ; + tid_global += blockDim.x * gridDim.x) + */ + int chunk_max= (cnz + chunksize -1)/chunksize; + for ( int chunk = blockIdx.x; + chunk < chunk_max; + chunk += gridDim.x ) + { + + //GB_PARTITION (pfirst, plast, cnz, tid_global, (mnvec+7)/8 ) ; + pfirst = chunksize * chunk ; + plast = GB_IMIN( chunksize * (chunk+1), cnz ) ; + + int chunk_end; + if ( cnz > chunksize) chunk_end = GB_IMIN( chunksize, + cnz - chunksize*(chunk) ); + else chunk_end = cnz; + + // find the first vector of the slice for task blockIdx.x: the + // vector that owns the entry Ai [pfirst] and Ax [pfirst]. + //kfirst = GB_search_for_vector_device (pfirst, Mp, 0, mnvec) ; + + // find the last vector of the slice for task blockIdx.x: the + // vector that owns the entry Ai [plast-1] and Ax [plast-1]. + //klast = GB_search_for_vector_device (plast-1, Mp, kfirst, mnvec) ; + + + for ( int p = pfirst + threadIdx.x; + p < pfirst + chunk_end; + p += blockDim.x ) + { + // get the entry C(i,j), and extract its bucket. Then + // place the entry C(i,j) in the global bucket it belongs to. + + // TODO: these writes to global are not coalesced. Instead: each + // threadblock could buffer its writes to 12 buffers and when the + // buffers are full they can be written to global. 
+ int ibucket = Ci[p] & 0xF; + //printf(" thd: %d p,Ci[p] = %ld,%ld,%d\n", threadIdx.x, p, Ci[p], irow ); + switch (ibucket) + { + case 0: bucket [my_bucket_0++ ] = p ; Ci[p] = Ci[p] >>4; break ; //unshift zombies + case 1: bucket [my_bucket_1++ ] = p ; break ; + case 2: bucket [my_bucket_2++ ] = p ; break ; + case 3: bucket [my_bucket_3++ ] = p ; break ; + case 4: bucket [my_bucket_4++ ] = p ; break ; + case 5: bucket [my_bucket_5++ ] = p ; break ; + case 6: bucket [my_bucket_6++ ] = p ; break ; + case 7: bucket [my_bucket_7++ ] = p ; break ; + case 8: bucket [my_bucket_8++ ] = p ; break ; + case 9: bucket [my_bucket_9++ ] = p ; break ; + case 10: bucket [my_bucket_10++] = p ; break ; + case 11: bucket [my_bucket_11++] = p ; break ; + default: break; + } + + } + //__syncthreads(); + } + +} + diff --git a/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_dndn.cu b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_dndn.cu new file mode 100644 index 0000000000..5211464e3b --- /dev/null +++ b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_dndn.cu @@ -0,0 +1,176 @@ + +//------------------------------------------------------------------------------ +// AxB_dot3_phase3_dndn.cu +//------------------------------------------------------------------------------ + +// This CUDA kernel produces the semi-ring product of two +// sparse matrices of types T_A and T_B and common index space size n, to a +// output matrix of type T_C. The matrices are sparse, with different numbers +// of non-zeros and different sparsity patterns. +// ie. we want to produce C = A'*B in the sense of the given semi-ring. + +// This version uses a simple warp-based dense dot product algorithm, when the +// vectors coming from both A and B are dense, for any size of N. + +// Both the grid and block are 1D, so blockDim.x is the # threads in a +// threadblock, and the # of threadblocks is grid.x + +// Let b = blockIdx.x, and let s be blockDim.x. s= 32 with a variable number +// of active threads = min( min(nzA, nzB), 32) + +// Thus, threadblock b owns a semi-ring dot product on a pair of vectors. +// The work is to load the data, do the multiply and add work and finally +// reduce this data to a scalar, and write it to Cx[pair]. 
+ +// int64_t start <- start of vector pairs for this kernel +// int64_t end <- end of vector pairs for this kernel +// int64_t *Bucket <- array of pair indices for all kernels +// GrB_Matrix C <- result matrix +// GrB_Matrix M <- mask matrix +// GrB_Matrix A <- input matrix A +// GrB_Matrix B <- input matrix B +// int sz <- size parameter (not used) + +#include +#include +#include +#include "matrix.h" +#include "mySemiRing.h" + +// Using tile size fixed at compile time, we don't need shared memory +#define tile_sz 32 + +using namespace cooperative_groups; + +template< typename T, int warp_sz> +__inline__ __device__ T warp_ReduceSum(thread_block_tile g, T val) +{ + // Each iteration halves the number of active threads + // Each thread adds its partial sum[i] to sum[lane+i] + for (int i = g.size() / 2; i > 0; i /= 2) + { + T next = g.shfl_down( val, i) ; + val = GB_ADD( val, next ); + } + return val; // note: only thread 0 will return full sum +} + +template +__inline__ __device__ +T block_ReduceSum(thread_block g, T val, T Ident) +{ + static __shared__ T shared[warpSize]; // Shared mem for 32 partial sums + int lane = threadIdx.x % warpSize; + int wid = threadIdx.x / warpSize; + thread_block_tile tile = tiled_partition(g); + + // Each warp performs partial reduction + val = warp_ReduceSum< T, warpSize>(tile, val); + + if (lane==0) shared[wid] = val; // Write reduced value to shared memory + + //tile.sync(); // Wait for all partial reductions + + if (wid > 0 || gridDim.x == 1 ) return val; + + //read from shared memory only if that warp existed + val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : Ident ; + + if (wid==0) val = warp_ReduceSum< T, warpSize>(tile,val); //Final reduce within first warp + + return val; +} + + +template< typename T_C, typename T_A, typename T_B, typename T_X, typename T_Y, typename T_Z> +__global__ void AxB_dot3_phase3_dndn +( + int64_t start, + int64_t end, + int64_t *Bucket, + GrB_Matrix C, + GrB_Matrix M, + GrB_Matrix A, + GrB_Matrix B, + int sz +) +{ + + T_A *Ax = (T_A*)A->x; + T_B *Bx = (T_B*)B->x; + T_C *Cx = (T_C*)C->x; + int64_t *Mi = M->i; + int64_t *Ci = C->i; + int64_t *Ap = A->p; + int64_t *Bp = B->p; + + // zombie count + int zc = 0; + int64_t pair_id; + + // total items to be inspected + int64_t nnzA = 0; + int64_t nnzB = 0; + int s = blockDim.x; + + // Main loop over pairs + for (pair_id = start + blockIdx.x; //warp per pair + pair_id < end; + pair_id += gridDim.x ){ + + int64_t i = Mi[pair_id]; + int64_t j = Ci[pair_id] >> 4; + + int64_t pA = Ap[i]; + int64_t xend = Ap[i+1]; + nnzA = xend - pA; + + int64_t pB = Bp[j]; + int64_t yend = Bp[j+1]; + nnzB = yend - pB; + + /* + if (threadIdx.x == 0 ){ + printf(" i,j = %d,%d nnz= %d xstart,end = %d,%d ystart,end = %d,%d\n", + (int)i,(int)j, (int)nnzA, (int)xstart,(int)xend, (int)ystart, (int)yend); + } + __syncthreads(); + */ + + + // convert global data pointer to the local pointer of this block + T_A aki; // *xdata = &Ax[xstart]; + T_B bkj; // *ydata = &Bx[ystart]; + T_Z cij; + + GB_GETA ( aki=(T_Z)Ax[pA+threadIdx.x] ) ; // aki = A(0,i) + GB_GETB ( bkj=(T_Z)Bx[pB+threadIdx.x] ) ; // bkj = B(0,j) + GB_C_MULT ( cij, aki, bkj ) ; // cij = aki * bkj + + for ( int tid = threadIdx.x + s; tid < nnzA; tid+= s) { + // cij += A(k,i) * B(k,j) + // GB_DOT_TERMINAL ( cij ) ; // break if cij == terminal + GB_GETA ( aki=(T_Z)Ax[pA+tid] ) ; // aki = A(k,i) + GB_GETB ( bkj=(T_Z)Bx[pB+tid] ) ; // bkj = B(k,j) + GB_MULTADD ( cij, aki, bkj ) ; // cij += aki * bkj + } + + + 
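+      // Each thread now holds one partial result cij; after the shfl_down
+      // reduction below, lane 0 of the warp holds the combined value, which
+      // is why only threadIdx.x == 0 stores Cx and Ci.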
//-------------------------------------------------------------------------- + // reduce per-thread sums to a single scalar + //-------------------------------------------------------------------------- + thread_block_tile<32> tile = tiled_partition<32>( this_thread_block() ); + cij = warp_ReduceSum ( tile, cij); + + // write result for this block to global mem + if (threadIdx.x == 0) + { + //printf("tid: %d final sum after reduce = %d\n", threadIdx.x, sum); + GB_PUTC( Cx[pair_id]=(T_C)cij ) ; + GB_PUTC( Ci[pair_id]=i ) ; + } + //__syncthreads ( ) ; + } + +} + diff --git a/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_dndn.cu.jit b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_dndn.cu.jit new file mode 100644 index 0000000000..e5564d863e --- /dev/null +++ b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_dndn.cu.jit @@ -0,0 +1,178 @@ +const char* const templates_GB_jit_AxB_dot3_phase3_dndn_cu = "templates/GB_jit_AxB_dot3_phase3_dndn.cu\n" +"\n" +"//------------------------------------------------------------------------------\n" +"// AxB_dot3_phase3_dndn.cu \n" +"//------------------------------------------------------------------------------\n" +"\n" +"// This CUDA kernel produces the semi-ring product of two\n" +"// sparse matrices of types T_A and T_B and common index space size n, to a \n" +"// output matrix of type T_C. The matrices are sparse, with different numbers\n" +"// of non-zeros and different sparsity patterns. \n" +"// ie. we want to produce C = A'*B in the sense of the given semi-ring.\n" +"\n" +"// This version uses a simple warp-based dense dot product algorithm, when the\n" +"// vectors coming from both A and B are dense, for any size of N.\n" +"\n" +"// Both the grid and block are 1D, so blockDim.x is the # threads in a\n" +"// threadblock, and the # of threadblocks is grid.x\n" +"\n" +"// Let b = blockIdx.x, and let s be blockDim.x. s= 32 with a variable number\n" +"// of active threads = min( min(nzA, nzB), 32) \n" +"\n" +"// Thus, threadblock b owns a semi-ring dot product on a pair of vectors. 
\n" +"// The work is to load the data, do the multiply and add work and finally \n" +"// reduce this data to a scalar, and write it to Cx[pair].\n" +"\n" +"// int64_t start <- start of vector pairs for this kernel\n" +"// int64_t end <- end of vector pairs for this kernel\n" +"// int64_t *Bucket <- array of pair indices for all kernels \n" +"// GrB_Matrix C <- result matrix \n" +"// GrB_Matrix M <- mask matrix\n" +"// GrB_Matrix A <- input matrix A\n" +"// GrB_Matrix B <- input matrix B\n" +"// int sz <- size parameter (not used) \n" +"\n" +"#include \n" +"#include \n" +"#include \n" +"#include \"matrix.h\"\n" +"#include \"mySemiRing.h\"\n" +"\n" +"// Using tile size fixed at compile time, we don't need shared memory\n" +"#define tile_sz 32 \n" +"\n" +"using namespace cooperative_groups;\n" +"\n" +"template< typename T, int warp_sz>\n" +"__inline__ __device__ T warp_ReduceSum(thread_block_tile g, T val)\n" +"{\n" +" // Each iteration halves the number of active threads\n" +" // Each thread adds its partial sum[i] to sum[lane+i]\n" +" for (int i = g.size() / 2; i > 0; i /= 2)\n" +" {\n" +" T next = g.shfl_down( val, i) ;\n" +" val = GB_ADD( val, next ); \n" +" }\n" +" return val; // note: only thread 0 will return full sum\n" +"}\n" +"\n" +"template\n" +"__inline__ __device__\n" +"T block_ReduceSum(thread_block g, T val, T Ident)\n" +"{\n" +" static __shared__ T shared[warpSize]; // Shared mem for 32 partial sums\n" +" int lane = threadIdx.x % warpSize;\n" +" int wid = threadIdx.x / warpSize;\n" +" thread_block_tile tile = tiled_partition(g);\n" +"\n" +" // Each warp performs partial reduction\n" +" val = warp_ReduceSum< T, warpSize>(tile, val); \n" +"\n" +" if (lane==0) shared[wid] = val; // Write reduced value to shared memory\n" +"\n" +" //tile.sync(); // Wait for all partial reductions\n" +"\n" +" if (wid > 0 || gridDim.x == 1 ) return val;\n" +"\n" +" //read from shared memory only if that warp existed\n" +" val = (threadIdx.x < blockDim.x / warpSize) ? 
shared[lane] : Ident ;\n" +"\n" +" if (wid==0) val = warp_ReduceSum< T, warpSize>(tile,val); //Final reduce within first warp\n" +"\n" +" return val;\n" +"}\n" +"\n" +"\n" +"template< typename T_C, typename T_A, typename T_B, typename T_X, typename T_Y, typename T_Z>\n" +"__global__ void AxB_dot3_phase3_dndn \n" +"(\n" +" int64_t start,\n" +" int64_t end,\n" +" int64_t *Bucket,\n" +" GrB_Matrix C,\n" +" GrB_Matrix M,\n" +" GrB_Matrix A,\n" +" GrB_Matrix B,\n" +" int sz\n" +")\n" +"{\n" +"\n" +" T_A *Ax = (T_A*)A->x;\n" +" T_B *Bx = (T_B*)B->x;\n" +" T_C *Cx = (T_C*)C->x;\n" +" int64_t *Mi = M->i;\n" +" int64_t *Ci = C->i;\n" +" int64_t *Ap = A->p;\n" +" int64_t *Bp = B->p;\n" +"\n" +" // zombie count\n" +" int zc = 0;\n" +" int64_t pair_id;\n" +"\n" +" // total items to be inspected\n" +" int64_t nnzA = 0;\n" +" int64_t nnzB = 0;\n" +" int s = blockDim.x;\n" +"\n" +" // Main loop over pairs \n" +" for (pair_id = start + blockIdx.x; //warp per pair \n" +" pair_id < end; \n" +" pair_id += gridDim.x ){\n" +"\n" +" int64_t i = Mi[pair_id];\n" +" int64_t j = Ci[pair_id] >> 4;\n" +"\n" +" int64_t pA = Ap[i];\n" +" int64_t xend = Ap[i+1];\n" +" nnzA = xend - pA;\n" +"\n" +" int64_t pB = Bp[j]; \n" +" int64_t yend = Bp[j+1]; \n" +" nnzB = yend - pB;\n" +"\n" +" /*\n" +" if (threadIdx.x == 0 ){\n" +" printf(\" i,j = %d,%d nnz= %d xstart,end = %d,%d ystart,end = %d,%d\\n\",\n" +" (int)i,(int)j, (int)nnzA, (int)xstart,(int)xend, (int)ystart, (int)yend);\n" +" }\n" +" __syncthreads(); \n" +" */\n" +"\n" +" \n" +" // convert global data pointer to the local pointer of this block\n" +" T_A aki; // *xdata = &Ax[xstart]; \n" +" T_B bkj; // *ydata = &Bx[ystart];\n" +" T_Z cij;\n" +"\n" +" GB_GETA ( aki=(T_Z)Ax[pA+threadIdx.x] ) ; // aki = A(0,i)\n" +" GB_GETB ( bkj=(T_Z)Bx[pB+threadIdx.x] ) ; // bkj = B(0,j)\n" +" GB_C_MULT ( cij, aki, bkj ) ; // cij = aki * bkj\n" +"\n" +" for ( int tid = threadIdx.x + s; tid < nnzA; tid+= s) { \n" +" // cij += A(k,i) * B(k,j)\n" +" // GB_DOT_TERMINAL ( cij ) ; // break if cij == terminal\n" +" GB_GETA ( aki=(T_Z)Ax[pA+tid] ) ; // aki = A(k,i)\n" +" GB_GETB ( bkj=(T_Z)Bx[pB+tid] ) ; // bkj = B(k,j)\n" +" GB_MULTADD ( cij, aki, bkj ) ; // cij += aki * bkj\n" +" }\n" +"\n" +"\n" +" //--------------------------------------------------------------------------\n" +" // reduce per-thread sums to a single scalar\n" +" //--------------------------------------------------------------------------\n" +" thread_block_tile<32> tile = tiled_partition<32>( this_thread_block() );\n" +" cij = warp_ReduceSum ( tile, cij);\n" +"\n" +" // write result for this block to global mem\n" +" if (threadIdx.x == 0)\n" +" {\n" +" //printf(\"tid: %d final sum after reduce = %d\\n\", threadIdx.x, sum);\n" +" GB_PUTC( Cx[pair_id]=(T_C)cij ) ;\n" +" GB_PUTC( Ci[pair_id]=i ) ;\n" +" }\n" +" //__syncthreads ( ) ;\n" +" }\n" +"\n" +"}\n" +"\n" +; diff --git a/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_mp.cu b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_mp.cu new file mode 100644 index 0000000000..825a02e2ab --- /dev/null +++ b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_mp.cu @@ -0,0 +1,300 @@ +//------------------------------------------------------------------------------ +// AxB_dot3_phase3_mp.cu +//------------------------------------------------------------------------------ + +// This CUDA kernel produces the semi-ring product of two +// sparse matrices of types T_A and T_B and common index space size n, to a +// output matrix of type T_C. 
The matrices are sparse, with different numbers +// of non-zeros and different sparsity patterns. +// ie. we want to produce C = A'*B in the sense of the given semi-ring. + +// This version uses a merge-path algorithm, when the sizes nnzA and nnzB are +// relatively close in size, neither is very spare nor dense, for any size of N. +// Handles arbitrary sparsity patterns with guaranteed load balance. + +// Both the grid and block are 1D, so blockDim.x is the # threads in a +// threadblock, and the # of threadblocks is grid.x + +// Let b = blockIdx.x, and let s be blockDim.x. s= 32 with a variable number +// of active threads = min( min(g_xnz, g_ynz), 32) + +// Thus, threadblock b owns a part of the index set spanned by g_xi and g_yi. Its job +// is to find the intersection of the index sets g_xi and g_yi, perform the semi-ring dot +// product on those items in the intersection, and finally reduce this data to a scalar, +// on exit write it to g_odata [b]. + +// int64_t start <- start of vector pairs for this kernel +// int64_t end <- end of vector pairs for this kernel +// int64_t *Bucket <- array of pair indices for all kernels +// matrix *C <- result matrix +// matrix *M <- mask matrix +// matrix *A <- input matrix A +// matrix *B <- input matrix B +#include +#include +#include +#include "mySemiRing.h" +#include "matrix.h" + +// Using tile size fixed at compile time, we don't need shared memory +#define tile_sz 32 + +using namespace cooperative_groups; + +template< typename T, int warp_sz> +__device__ __inline__ +T GB_reduce_sum(thread_block_tile g, T val) +{ + // Each iteration halves the number of active threads + // Each thread adds its partial sum[i] to sum[lane+i] + for (int i = g.size() / 2; i > 0; i /= 2) + { + T next = g.shfl_down( val, i); + val = GB_ADD( val, next ) ; + } + return val; +} + +template< typename T, int warp_sz> +__device__ __inline__ +T reduce_plus(thread_block_tile g, T val) +{ + // Each iteration halves the number of active threads + // Each thread adds its partial sum[i] to sum[lane+i] + for (int i = g.size() / 2; i > 0; i /= 2) + { + val += g.shfl_down( val, i) ; + } + return val; // note: only thread 0 will return full sum and flag value +} + +#define intersects_per_thread 8 + +template< typename T_C, typename T_A, typename T_B, typename T_X, typename T_Y, typename T_Z> +__global__ void AxB_dot3_phase3_mp +( + int64_t start, + int64_t end, + int64_t *Bucket, + GrB_Matrix C, + GrB_Matrix M, + GrB_Matrix A, + GrB_Matrix B, + int sz +) +{ + + T_A *Ax = (T_A*)A->x; + T_B *Bx = (T_B*)B->x; + T_C *Cx = (T_C*)C->x; + int64_t *Ci = C->i; + int64_t *Mi = M->i; + int64_t *Ai = A->i; + int64_t *Bi = B->i; + int64_t *Ap = A->p; + int64_t *Bp = B->p; + + + // zombie count + int zc = 0; + + int64_t pair_id; + + // set thread ID + int tid_global = threadIdx.x+ blockDim.x* blockIdx.x; + int tid = threadIdx.x; + + int b = blockIdx.x ; + + // total items to be inspected + int64_t nnzA = 0; + int64_t nnzB = 0; + int64_t n_intersect = 0; + + thread_block_tile tile = tiled_partition( this_thread_block()); + + int parts = blockDim.x; //(n_intersect+ intersects_per_thread -1)/ intersects_per_thread; + + // int has_zombies = 0 ; + + // Main loop over pairs + for (pair_id = start+ blockIdx.x; //warp per pair + pair_id < end; + pair_id += gridDim.x ) + { + + int64_t i = Mi[pair_id]; + int64_t j = Ci[pair_id] >> 4; + + int64_t xstart = Ap[i]; + int64_t xend = Ap[i+1]; + nnzA = xend - xstart; + + int64_t ystart = Bp[j]; + int64_t yend = Bp[j+1]; + nnzB = yend - ystart; + + n_intersect 
= GB_IMIN( xend -xstart, yend -ystart); + /* + if (threadIdx.x ==0 ) { + printf("block %d doing dot %lld i,j= %lld,%lld\n", blockIdx.x, pair_id, i, j); + } + */ + //we want more than one intersection per thread + int64_t nxy = nnzA + nnzB; + + int work_per_thread = (nxy +parts -1)/parts; + int diag = GB_IMIN( work_per_thread*tid, nxy); + int diag_end = GB_IMIN( diag + work_per_thread, nxy); + //printf(" thd%d parts = %u wpt = %u diag, diag_end = %u,%u\n",tid, parts, work_per_thread, diag, diag_end); + + int x_min = GB_IMAX( (int)(diag - nnzB), 0); + int x_max = GB_IMIN( diag, nnzA); + + //printf("start thd%u x_min = %u x_max = %u\n", tid_global, x_min,x_max); + while ( x_min < x_max) { //binary search for correct diag break + int pivot = (x_min +x_max)/2; + if ( Ai[pivot + xstart] < Bi[ diag -pivot -1 + ystart]) { + x_min = pivot +1; + } + else { + x_max = pivot; + } + } + int xcoord = x_min; + int ycoord = diag -x_min -1; + if (( diag > 0) &&(diag < (nnzA+nnzB)) && (Ai[xcoord+xstart] == Bi[ycoord+ystart]) ) { + diag--; //adjust for intersection incrementing both pointers + } + // two start points are known now + int tx_start = xcoord +xstart; + int ty_start = diag -xcoord +ystart; + + //if (x_start != y_start) + // printf("start thd%u xs,ys = %i,%i\n", tid_global, x_start, y_start); + + x_min = GB_IMAX( (int)(diag_end - nnzB), 0); + x_max = GB_IMIN( diag_end, nnzA); + + while ( x_min < x_max) { + int pivot = (x_min +x_max)/2; + //printf("thd%u pre_sw piv=%u diag_e = %u xmin,xmax=%u,%u\n", tid_global, pivot, diag_end,x_min, x_max); + if ( Ai[pivot+ xstart] < Bi[ diag_end -pivot -1 +ystart]) { + x_min = pivot +1; + } + else { + x_max = pivot; + } + //printf("thd%u piv=%u xmin,xmax = %u,%u\n", tid_global, pivot, x_min, x_max); + } + xcoord = x_min; + ycoord = diag_end -x_min -1; + if ( (diag_end < (nnzA +nnzB)) && (Ai[xcoord +xstart] == Bi[ycoord + ystart]) ) { + diag--; //adjust for intersection incrementing both pointers + } + // two end points are known now + int tx_end = xcoord +xstart; + int ty_end = diag_end - xcoord + ystart; + + T_A aki; + T_B bkj; + T_Z cij = GB_IDENTITY ; + + // TODO PLUS_PAIR_INT64, FP32, FP64: no need for cij_exists. + // just check if cij > 0 + + int cij_exists = 0 ; + //printf(" thd%u has init value %f\n",tid, cij); + + //merge-path dot product + int k = tx_start; + int l = ty_start; + while ( k < tx_end && l < ty_end ) + { + if (Ai [k] == Bi [l]) + { + GB_GETA ( aki=(T_Z)Ax[k] ) ; + GB_GETB ( bkj=(T_Z)Bx[l] ) ; + if (cij_exists) + { + T_Z t = GB_MULT( (T_Z)aki, (T_Z)bkj ); + GB_ADD_F (cij, t ) ; + //printf(" thd%d ix at %lld cij += %d * %d \n", tid_global, Ai[k], aki, bkj); + } + else + { + cij_exists = 1 ; + cij = GB_MULT ( (T_Z)aki, (T_Z)bkj ) ; + //printf(" thd%d ix at %lld cij = %d * %d \n", tid_global, Ai[k], Ax[k], Bx[l]); + } + // TODO check terminal condition + k+= 1; + l+= 1; + //printf(" block%u work value = %d, exists = %d\n", b, cij, cij_exists); + } + else + { + k += ( Ai[k] < Bi[l] ) ; + l += ( Ai[k] > Bi[l] ) ; + } + } + + //tile.sync( ) ; + //-------------------------------------------------------------------------- + // reduce sum per-thread values to a single scalar, get OR of flag + //-------------------------------------------------------------------------- + /* + if (tid == 0) + { + printf ("reduce %d : %d exists = %d\n", b, cij, cij_exists) ; + } + __syncthreads(); + */ + + // Do vote here for control. 
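+        // tile.any() is a warp-wide ballot: it returns true in every lane of
+        // the 32-thread tile if any lane found a matching index pair, so all
+        // lanes agree on whether C(i,j) exists before doing the reduction.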
+ cij_exists = tile.any( cij_exists); + //tile.sync(); + + if (cij_exists) + { + cij = GB_reduce_sum( tile, cij ); + + } + // else has_zombies = 1; + + + //__syncthreads(); + //tile.sync( ); + // write result for this block to global mem + if (tid == 0) + { + //printf ("final %d : %d exists = %d\n", b, cij, cij_exists) ; + if (cij_exists) + { + //printf(" cij = %d\n", cij); + GB_PUTC ( Cx[pair_id]=(T_C)cij ) ; + GB_PUTC ( Ci[pair_id]=i ) ; + } + else + { + //printf(" dot %d is a zombie\n", pair_id); + zc++; + GB_PUTC ( Ci[pair_id]=GB_FLIP (i) ) ; + } + } + //__syncthreads(); + } + +//-------------------------------------------------------------------------- + + if( tid ==0 && zc > 0) + { + //printf("warp %d zombie count = %d\n", blockIdx.x, zc); + atomicAdd( (unsigned long long int*)&(C->nzombies), (unsigned long long int)zc); + //printf(" Czombie = %lld\n",C->nzombies); + } + + //__syncthreads(); + +} + diff --git a/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_mp.cu.jit b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_mp.cu.jit new file mode 100644 index 0000000000..ed9c569df5 --- /dev/null +++ b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_mp.cu.jit @@ -0,0 +1,302 @@ +const char* const templates_GB_jit_AxB_dot3_phase3_mp_cu = "templates/GB_jit_AxB_dot3_phase3_mp.cu\n" +"//------------------------------------------------------------------------------\n" +"// AxB_dot3_phase3_mp.cu \n" +"//------------------------------------------------------------------------------\n" +"\n" +"// This CUDA kernel produces the semi-ring product of two\n" +"// sparse matrices of types T_A and T_B and common index space size n, to a \n" +"// output matrix of type T_C. The matrices are sparse, with different numbers\n" +"// of non-zeros and different sparsity patterns. \n" +"// ie. we want to produce C = A'*B in the sense of the given semi-ring.\n" +"\n" +"// This version uses a merge-path algorithm, when the sizes nnzA and nnzB are \n" +"// relatively close in size, neither is very spare nor dense, for any size of N.\n" +"// Handles arbitrary sparsity patterns with guaranteed load balance.\n" +"\n" +"// Both the grid and block are 1D, so blockDim.x is the # threads in a\n" +"// threadblock, and the # of threadblocks is grid.x\n" +"\n" +"// Let b = blockIdx.x, and let s be blockDim.x. s= 32 with a variable number\n" +"// of active threads = min( min(g_xnz, g_ynz), 32) \n" +"\n" +"// Thus, threadblock b owns a part of the index set spanned by g_xi and g_yi. 
Its job\n" +"// is to find the intersection of the index sets g_xi and g_yi, perform the semi-ring dot\n" +"// product on those items in the intersection, and finally reduce this data to a scalar, \n" +"// on exit write it to g_odata [b].\n" +"\n" +"// int64_t start <- start of vector pairs for this kernel\n" +"// int64_t end <- end of vector pairs for this kernel\n" +"// int64_t *Bucket <- array of pair indices for all kernels \n" +"// matrix *C <- result matrix \n" +"// matrix *M <- mask matrix\n" +"// matrix *A <- input matrix A\n" +"// matrix *B <- input matrix B\n" +"#include \n" +"#include \n" +"#include \n" +"#include \"mySemiRing.h\"\n" +"#include \"matrix.h\"\n" +"\n" +"// Using tile size fixed at compile time, we don't need shared memory\n" +"#define tile_sz 32 \n" +"\n" +"using namespace cooperative_groups;\n" +"\n" +"template< typename T, int warp_sz>\n" +"__device__ __inline__ \n" +"T GB_reduce_sum(thread_block_tile g, T val)\n" +"{\n" +" // Each iteration halves the number of active threads\n" +" // Each thread adds its partial sum[i] to sum[lane+i]\n" +" for (int i = g.size() / 2; i > 0; i /= 2)\n" +" {\n" +" T next = g.shfl_down( val, i);\n" +" val = GB_ADD( val, next ) ;\n" +" }\n" +" return val;\n" +"}\n" +"\n" +"template< typename T, int warp_sz>\n" +"__device__ __inline__ \n" +"T reduce_plus(thread_block_tile g, T val)\n" +"{\n" +" // Each iteration halves the number of active threads\n" +" // Each thread adds its partial sum[i] to sum[lane+i]\n" +" for (int i = g.size() / 2; i > 0; i /= 2)\n" +" {\n" +" val += g.shfl_down( val, i) ;\n" +" }\n" +" return val; // note: only thread 0 will return full sum and flag value\n" +"}\n" +"\n" +"#define intersects_per_thread 8\n" +"\n" +"template< typename T_C, typename T_A, typename T_B, typename T_X, typename T_Y, typename T_Z> \n" +"__global__ void AxB_dot3_phase3_mp\n" +"(\n" +" int64_t start,\n" +" int64_t end,\n" +" int64_t *Bucket,\n" +" GrB_Matrix C,\n" +" GrB_Matrix M,\n" +" GrB_Matrix A,\n" +" GrB_Matrix B,\n" +" int sz\n" +")\n" +"{\n" +"\n" +" T_A *Ax = (T_A*)A->x;\n" +" T_B *Bx = (T_B*)B->x;\n" +" T_C *Cx = (T_C*)C->x;\n" +" int64_t *Ci = C->i;\n" +" int64_t *Mi = M->i;\n" +" int64_t *Ai = A->i;\n" +" int64_t *Bi = B->i;\n" +" int64_t *Ap = A->p;\n" +" int64_t *Bp = B->p;\n" +"\n" +"\n" +" // zombie count\n" +" int zc = 0;\n" +"\n" +" int64_t pair_id;\n" +"\n" +" // set thread ID\n" +" int tid_global = threadIdx.x+ blockDim.x* blockIdx.x;\n" +" int tid = threadIdx.x;\n" +"\n" +" int b = blockIdx.x ;\n" +"\n" +" // total items to be inspected\n" +" int64_t nnzA = 0;\n" +" int64_t nnzB = 0;\n" +" int64_t n_intersect = 0;\n" +"\n" +" thread_block_tile tile = tiled_partition( this_thread_block());\n" +"\n" +" int parts = blockDim.x; //(n_intersect+ intersects_per_thread -1)/ intersects_per_thread; \n" +"\n" +" // int has_zombies = 0 ;\n" +"\n" +" // Main loop over pairs \n" +" for (pair_id = start+ blockIdx.x; //warp per pair \n" +" pair_id < end; \n" +" pair_id += gridDim.x )\n" +" {\n" +"\n" +" int64_t i = Mi[pair_id];\n" +" int64_t j = Ci[pair_id] >> 4;\n" +"\n" +" int64_t xstart = Ap[i];\n" +" int64_t xend = Ap[i+1];\n" +" nnzA = xend - xstart;\n" +"\n" +" int64_t ystart = Bp[j]; \n" +" int64_t yend = Bp[j+1]; \n" +" nnzB = yend - ystart;\n" +"\n" +" n_intersect = GB_IMIN( xend -xstart, yend -ystart); \n" +" /* \n" +" if (threadIdx.x ==0 ) {\n" +" printf(\"block %d doing dot %lld i,j= %lld,%lld\\n\", blockIdx.x, pair_id, i, j);\n" +" }\n" +" */\n" +" //we want more than one intersection per thread\n" +" int64_t nxy = 
nnzA + nnzB;\n" +"\n" +" int work_per_thread = (nxy +parts -1)/parts;\n" +" int diag = GB_IMIN( work_per_thread*tid, nxy);\n" +" int diag_end = GB_IMIN( diag + work_per_thread, nxy);\n" +" //printf(\" thd%d parts = %u wpt = %u diag, diag_end = %u,%u\\n\",tid, parts, work_per_thread, diag, diag_end); \n" +"\n" +" int x_min = GB_IMAX( (int)(diag - nnzB), 0);\n" +" int x_max = GB_IMIN( diag, nnzA);\n" +"\n" +" //printf(\"start thd%u x_min = %u x_max = %u\\n\", tid_global, x_min,x_max);\n" +" while ( x_min < x_max) { //binary search for correct diag break\n" +" int pivot = (x_min +x_max)/2;\n" +" if ( Ai[pivot + xstart] < Bi[ diag -pivot -1 + ystart]) {\n" +" x_min = pivot +1;\n" +" }\n" +" else {\n" +" x_max = pivot;\n" +" }\n" +" }\n" +" int xcoord = x_min;\n" +" int ycoord = diag -x_min -1;\n" +" if (( diag > 0) &&(diag < (nnzA+nnzB)) && (Ai[xcoord+xstart] == Bi[ycoord+ystart]) ) { \n" +" diag--; //adjust for intersection incrementing both pointers \n" +" }\n" +" // two start points are known now\n" +" int tx_start = xcoord +xstart;\n" +" int ty_start = diag -xcoord +ystart; \n" +"\n" +" //if (x_start != y_start)\n" +" // printf(\"start thd%u xs,ys = %i,%i\\n\", tid_global, x_start, y_start);\n" +"\n" +" x_min = GB_IMAX( (int)(diag_end - nnzB), 0);\n" +" x_max = GB_IMIN( diag_end, nnzA);\n" +"\n" +" while ( x_min < x_max) {\n" +" int pivot = (x_min +x_max)/2;\n" +" //printf(\"thd%u pre_sw piv=%u diag_e = %u xmin,xmax=%u,%u\\n\", tid_global, pivot, diag_end,x_min, x_max);\n" +" if ( Ai[pivot+ xstart] < Bi[ diag_end -pivot -1 +ystart]) {\n" +" x_min = pivot +1;\n" +" }\n" +" else {\n" +" x_max = pivot;\n" +" }\n" +" //printf(\"thd%u piv=%u xmin,xmax = %u,%u\\n\", tid_global, pivot, x_min, x_max);\n" +" }\n" +" xcoord = x_min;\n" +" ycoord = diag_end -x_min -1;\n" +" if ( (diag_end < (nnzA +nnzB)) && (Ai[xcoord +xstart] == Bi[ycoord + ystart]) ) { \n" +" diag--; //adjust for intersection incrementing both pointers \n" +" }\n" +" // two end points are known now\n" +" int tx_end = xcoord +xstart; \n" +" int ty_end = diag_end - xcoord + ystart; \n" +"\n" +" T_A aki;\n" +" T_B bkj;\n" +" T_Z cij = GB_IDENTITY ;\n" +"\n" +" // TODO PLUS_PAIR_INT64, FP32, FP64: no need for cij_exists.\n" +" // just check if cij > 0\n" +"\n" +" int cij_exists = 0 ;\n" +" //printf(\" thd%u has init value %f\\n\",tid, cij);\n" +"\n" +" //merge-path dot product\n" +" int k = tx_start;\n" +" int l = ty_start;\n" +" while ( k < tx_end && l < ty_end )\n" +" {\n" +" if (Ai [k] == Bi [l])\n" +" {\n" +" GB_GETA ( aki=(T_Z)Ax[k] ) ;\n" +" GB_GETB ( bkj=(T_Z)Bx[l] ) ;\n" +" if (cij_exists)\n" +" {\n" +" T_Z t = GB_MULT( (T_Z)aki, (T_Z)bkj );\n" +" GB_ADD_F (cij, t ) ;\n" +" //printf(\" thd%d ix at %lld cij += %d * %d \\n\", tid_global, Ai[k], aki, bkj);\n" +" }\n" +" else\n" +" {\n" +" cij_exists = 1 ;\n" +" cij = GB_MULT ( (T_Z)aki, (T_Z)bkj ) ;\n" +" //printf(\" thd%d ix at %lld cij = %d * %d \\n\", tid_global, Ai[k], Ax[k], Bx[l]);\n" +" }\n" +" // TODO check terminal condition\n" +" k+= 1;\n" +" l+= 1;\n" +" //printf(\" block%u work value = %d, exists = %d\\n\", b, cij, cij_exists);\n" +" }\n" +" else\n" +" {\n" +" k += ( Ai[k] < Bi[l] ) ;\n" +" l += ( Ai[k] > Bi[l] ) ;\n" +" }\n" +" }\n" +"\n" +" //tile.sync( ) ;\n" +" //--------------------------------------------------------------------------\n" +" // reduce sum per-thread values to a single scalar, get OR of flag\n" +" //--------------------------------------------------------------------------\n" +" /*\n" +" if (tid == 0)\n" +" {\n" +" printf (\"reduce %d : %d exists = 
%d\\n\", b, cij, cij_exists) ;\n" +" }\n" +" __syncthreads();\n" +" */\n" +"\n" +" // Do vote here for control.\n" +" cij_exists = tile.any( cij_exists);\n" +" //tile.sync();\n" +"\n" +" if (cij_exists)\n" +" {\n" +" cij = GB_reduce_sum( tile, cij );\n" +" \n" +" }\n" +" // else has_zombies = 1;\n" +"\n" +"\n" +" //__syncthreads();\n" +" //tile.sync( );\n" +" // write result for this block to global mem\n" +" if (tid == 0)\n" +" {\n" +" //printf (\"final %d : %d exists = %d\\n\", b, cij, cij_exists) ;\n" +" if (cij_exists)\n" +" {\n" +" //printf(\" cij = %d\\n\", cij);\n" +" GB_PUTC ( Cx[pair_id]=(T_C)cij ) ;\n" +" GB_PUTC ( Ci[pair_id]=i ) ;\n" +" }\n" +" else\n" +" {\n" +" //printf(\" dot %d is a zombie\\n\", pair_id);\n" +" zc++;\n" +" GB_PUTC ( Ci[pair_id]=GB_FLIP (i) ) ;\n" +" }\n" +" }\n" +" //__syncthreads(); \n" +" }\n" +"\n" +"//--------------------------------------------------------------------------\n" +"\n" +" if( tid ==0 && zc > 0)\n" +" {\n" +" //printf(\"warp %d zombie count = %d\\n\", blockIdx.x, zc);\n" +" atomicAdd( (unsigned long long int*)&(C->zombie_count), (unsigned long long int)zc);\n" +" //printf(\" Czombie = %lld\\n\",C->zombie_count);\n" +" }\n" +"\n" +" //__syncthreads();\n" +"\n" +"}\n" +"\n" +; diff --git a/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_spdn.cu b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_spdn.cu new file mode 100644 index 0000000000..537c489fb8 --- /dev/null +++ b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_spdn.cu @@ -0,0 +1,136 @@ +//****************************************************************************** +// Sparse dot products in batch form, sparse - dense case. +// Each thread in this kernel is responsible for m vector-pairs(x,y), +// m = 256/sz, where sz is in {4, 16, 64, 256} +// We know each non-zero on the sparse side will hit a dense value. +// Template on +// Parameters: + +// int64_t start <- beginning of bucket +// int64_t end <- end of bucket +// int64_t *Bucket <- index of each pair in this bucket +// matrix *C <- C result matrix +// matrix *M <- Mask matrix +// matrix *A <- A matrix to multiply, sparse +// matrix *B <- B matrix to multiply, dense in sparse format? +// int sz <- size hint for smaller vector +//****************************************************************************** +#include +#include +#include +#include "mySemiRing.h" +#include "matrix.h" + +template< typename T_C, typename T_A, typename T_B, typename T_X, typename T_Y, typename T_Z> +__global__ void AxB_dot3_phase3_spdn +( + int64_t start, + int64_t end, + int64_t *Bucket, + GrB_Matrix C, + GrB_Matrix M, + GrB_Matrix A, + GrB_Matrix B, + int sz +) +{ + + T_A *Ax = (T_A*)A->x; + T_B *Bx = (T_B*)B->x; + T_C *Cx = (T_C*)C->x; + int64_t *Ci = C->i; + int64_t *Mi = M->i; + int64_t *Ai = A->i; + int64_t *Bi = B->i; + int64_t *Ap = A->p; + int64_t *Bp = B->p; + + // sz = expected non-zeros per dot + int m = 256/sz; + int nvecs = end - start; + int dpt = nvecs/32; + m = dpt < m ? 
dpt : m; + //if( threadIdx.x ==0) + // printf("thd:%d %d dots/thrd, nvecs = %d blockDim=%d\n",threadIdx.x, sz, nvecs, blockDim.x); + //__syncthreads(); + int dots = (nvecs +m -1)/m; + int zc = 0; + + for ( int tid= threadIdx.x +blockDim.x*blockIdx.x; + tid < dots; + tid += blockDim.x * gridDim.x) { + int pair_id, im; + //if (threadIdx.x ==0) + // printf("thd%u pi=%lld\n",tid, start+threadIdx.x); + // __syncthreads(); + + for (pair_id = start+tid, im = 0; + im < m && pair_id < end; + ++im, pair_id += dots ){ + + int64_t i = Mi[pair_id]; + int64_t j = Ci[pair_id] >> 4; + //if (threadIdx.x ==0) + // printf("thd%u i,j=%lld,%lld\n",tid, i,j); + // __syncthreads(); + + // printf("thd%d pi=%d xn=%lld yn=%lld\n",tid, pair_id, + // A->p[i+1]- A->p[i], + // B->p[j+1]- B->p[j]); + + int64_t pA = Ap[i]; + int64_t pA_end = Ap[i+1]; + int64_t nnzA = pA_end - pA; + int64_t pB = Bp[i]; + int64_t pB_end = Bp[i+1]; + int64_t nnzB = pB_end - pB; + T_A aki; + T_B bkj; + T_Z cij; + + if( nnzA == A->vlen) // A is dense + { + int64_t k = Bi [pB] ; // first row index of B(:,j) + // cij = A(k,i) * B(k,j) + GB_GETA ( aki=(T_Z)Ax[pA+k] ) ; // aki = A(k,i) + GB_GETB ( bkj=(T_Z)Bx[pB] ) ; // bkj = B(k,j) + GB_C_MULT ( cij, aki, bkj ) ; // cij = aki * bkj + + for (int64_t p = pB+1 ; p < pB_end ; p++) + { + //GB_DOT_TERMINAL (cij) ; // break if cij == terminal + int64_t k = Bi [p] ; // next row index of B(:,j) + // cij += A(k,i) * B(k,j) + GB_GETA ( aki=(T_Z)Ax[pA+k] ) ; // aki = A(k,i) + GB_GETB ( bkj=(T_Z)Bx[p] ) ; // bkj = B(k,j) + GB_MULTADD ( cij, aki, bkj ) ; // cij += aki * bkj + } + + } + if( nnzB == B->vlen) // B is dense + { + int64_t k = Ai [pA] ; // first row index of A(:,i) + // cij = A(k,i) * B(k,j) + GB_GETA ( aki=(T_Z)Ax[ pA ] ) ; // aki = A(k,i) + GB_GETB ( bkj=(T_Z)Bx[ pB+k ] ) ; // bkj = B(k,j) + GB_C_MULT ( cij, aki, bkj) ; // cij = aki * bkj + + for (int64_t p = pA+1 ; p < pA_end ; p++) + { + //GB_DOT_TERMINAL (cij) ; // break if cij == terminal + int64_t k = Ai [p] ; // next row index of A(:,i) + // cij += A(k,i) * B(k,j) + GB_GETA ( aki=(T_Z)Ax[ p ] ) ; // aki = A(k,i) + GB_GETB ( bkj=(T_Z)Bx[ pB+k] ) ; // bkj = B(k,j) + GB_MULTADD ( cij, aki, bkj) ; // cij += aki * bkj + } + } + + GB_PUTC( Ci[pair_id]=i ) ; + GB_PUTC( Cx[pair_id]=cij ) ; + + } + + } + +} diff --git a/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_spdn.cu.jit b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_spdn.cu.jit new file mode 100644 index 0000000000..d057e78a6c --- /dev/null +++ b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_spdn.cu.jit @@ -0,0 +1,138 @@ +const char* const templates_GB_jit_AxB_dot3_phase3_spdn_cu = "templates/GB_jit_AxB_dot3_phase3_spdn.cu\n" +"//******************************************************************************\n" +"// Sparse dot products in batch form, sparse - dense case. \n" +"// Each thread in this kernel is responsible for m vector-pairs(x,y), \n" +"// m = 256/sz, where sz is in {4, 16, 64, 256}\n" +"// We know each non-zero on the sparse side will hit a dense value.\n" +"// Template on \n" +"// Parameters:\n" +"\n" +"// int64_t start <- beginning of bucket \n" +"// int64_t end <- end of bucket\n" +"// int64_t *Bucket <- index of each pair in this bucket\n" +"// matrix *C <- C result matrix \n" +"// matrix *M <- Mask matrix \n" +"// matrix *A <- A matrix to multiply, sparse \n" +"// matrix *B <- B matrix to multiply, dense in sparse format? 
\n" +"// int sz <- size hint for smaller vector\n" +"//******************************************************************************\n" +"#include \n" +"#include \n" +"#include \n" +"#include \"mySemiRing.h\"\n" +"#include \"matrix.h\"\n" +"\n" +"template< typename T_C, typename T_A, typename T_B, typename T_X, typename T_Y, typename T_Z>\n" +"__global__ void AxB_dot3_phase3_spdn\n" +"( \n" +" int64_t start, \n" +" int64_t end,\n" +" int64_t *Bucket, \n" +" GrB_Matrix C, \n" +" GrB_Matrix M, \n" +" GrB_Matrix A, \n" +" GrB_Matrix B,\n" +" int sz \n" +")\n" +"{\n" +"\n" +" T_A *Ax = (T_A*)A->x;\n" +" T_B *Bx = (T_B*)B->x;\n" +" T_C *Cx = (T_C*)C->x;\n" +" int64_t *Ci = C->i;\n" +" int64_t *Mi = M->i;\n" +" int64_t *Ai = A->i;\n" +" int64_t *Bi = B->i;\n" +" int64_t *Ap = A->p;\n" +" int64_t *Bp = B->p;\n" +"\n" +" // sz = expected non-zeros per dot \n" +" int m = 256/sz;\n" +" int nvecs = end - start;\n" +" int dpt = nvecs/32;\n" +" m = dpt < m ? dpt : m;\n" +" //if( threadIdx.x ==0)\n" +" // printf(\"thd:%d %d dots/thrd, nvecs = %d blockDim=%d\\n\",threadIdx.x, sz, nvecs, blockDim.x);\n" +" //__syncthreads();\n" +" int dots = (nvecs +m -1)/m; \n" +" int zc = 0;\n" +" \n" +" for ( int tid= threadIdx.x +blockDim.x*blockIdx.x;\n" +" tid < dots;\n" +" tid += blockDim.x * gridDim.x) {\n" +" int pair_id, im; \n" +" //if (threadIdx.x ==0)\n" +" // printf(\"thd%u pi=%lld\\n\",tid, start+threadIdx.x); \n" +" // __syncthreads();\n" +"\n" +" for (pair_id = start+tid, im = 0; \n" +" im < m && pair_id < end; \n" +" ++im, pair_id += dots ){\n" +"\n" +" int64_t i = Mi[pair_id];\n" +" int64_t j = Ci[pair_id] >> 4;\n" +" //if (threadIdx.x ==0)\n" +" // printf(\"thd%u i,j=%lld,%lld\\n\",tid, i,j); \n" +" // __syncthreads();\n" +" \n" +" // printf(\"thd%d pi=%d xn=%lld yn=%lld\\n\",tid, pair_id, \n" +" // A->p[i+1]- A->p[i],\n" +" // B->p[j+1]- B->p[j]);\n" +"\n" +" int64_t pA = Ap[i];\n" +" int64_t pA_end = Ap[i+1];\n" +" int64_t nnzA = pA_end - pA;\n" +" int64_t pB = Bp[i];\n" +" int64_t pB_end = Bp[i+1];\n" +" int64_t nnzB = pB_end - pB;\n" +" T_A aki;\n" +" T_B bkj;\n" +" T_Z cij;\n" +"\n" +" if( nnzA == A->vlen) // A is dense\n" +" {\n" +" int64_t k = Bi [pB] ; // first row index of B(:,j)\n" +" // cij = A(k,i) * B(k,j)\n" +" GB_GETA ( aki=(T_Z)Ax[pA+k] ) ; // aki = A(k,i)\n" +" GB_GETB ( bkj=(T_Z)Bx[pB] ) ; // bkj = B(k,j)\n" +" GB_C_MULT ( cij, aki, bkj ) ; // cij = aki * bkj\n" +"\n" +" for (int64_t p = pB+1 ; p < pB_end ; p++)\n" +" { \n" +" //GB_DOT_TERMINAL (cij) ; // break if cij == terminal\n" +" int64_t k = Bi [p] ; // next row index of B(:,j)\n" +" // cij += A(k,i) * B(k,j)\n" +" GB_GETA ( aki=(T_Z)Ax[pA+k] ) ; // aki = A(k,i)\n" +" GB_GETB ( bkj=(T_Z)Bx[p] ) ; // bkj = B(k,j)\n" +" GB_MULTADD ( cij, aki, bkj ) ; // cij += aki * bkj\n" +" }\n" +"\n" +" }\n" +" if( nnzB == B->vlen) // B is dense\n" +" {\n" +" int64_t k = Ai [pA] ; // first row index of A(:,i)\n" +" // cij = A(k,i) * B(k,j)\n" +" GB_GETA ( aki=(T_Z)Ax[ pA ] ) ; // aki = A(k,i)\n" +" GB_GETB ( bkj=(T_Z)Bx[ pB+k ] ) ; // bkj = B(k,j)\n" +" GB_C_MULT ( cij, aki, bkj) ; // cij = aki * bkj\n" +"\n" +" for (int64_t p = pA+1 ; p < pA_end ; p++)\n" +" { \n" +" //GB_DOT_TERMINAL (cij) ; // break if cij == terminal\n" +" int64_t k = Ai [p] ; // next row index of A(:,i)\n" +" // cij += A(k,i) * B(k,j)\n" +" GB_GETA ( aki=(T_Z)Ax[ p ] ) ; // aki = A(k,i)\n" +" GB_GETB ( bkj=(T_Z)Bx[ pB+k] ) ; // bkj = B(k,j)\n" +" GB_MULTADD ( cij, aki, bkj) ; // cij += aki * bkj\n" +" }\n" +" }\n" +"\n" +" GB_PUTC( Ci[pair_id]=i ) ;\n" +" GB_PUTC( 
Cx[pair_id]=cij ) ;\n" +" \n" +" }\n" +" \n" +" }\n" +" \n" +"}\n" +; diff --git a/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_vssp.cu b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_vssp.cu new file mode 100644 index 0000000000..3ed255b7e3 --- /dev/null +++ b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_vssp.cu @@ -0,0 +1,234 @@ +//------------------------------------------------------------------------------ +// spGEMM_very_sparse_sparse.cu +//------------------------------------------------------------------------------ + +// The spGEM_vssp CUDA kernel produces the semi-ring product of two +// sparse matrices of types T_A and T_B and common index space size n, to a +// output matrix of type T_C. The matrices are sparse, with different numbers +// of non-zeros and different sparsity patterns. +// ie. we want to produce C = A'*B in the sense of the given semi-ring. + +// This version uses a binary-search algorithm, when the sizes nnzA and nnzB +// are far apart in size, neither is very spare nor dense, for any size of N. + +// Both the grid and block are 1D, so blockDim.x is the # threads in a +// threadblock, and the # of threadblocks is grid.x + +// Let b = blockIdx.x, and let s be blockDim.x. s= 32 with a variable number +// of active threads = min( min(nzA, nzB), 32) + +// Thus, each t in threadblock b owns a part of the set of pairs in the +// sparse-sparse bucket of work. The job for each pair of vectors is to find +// the intersection of the index sets Ai and Bi, perform the semi-ring dot +// product on those items in the intersection, and finally +// on exit write it to Cx [pair]. + +// int64_t start <- start of vector pairs for this kernel +// int64_t end <- end of vector pairs for this kernel +// int64_t *Bucket <- array of pair indices for all kernels +// GrB_Matrix C <- result matrix +// GrB_Matrix M <- mask matrix +// GrB_Matrix A <- input matrix A +// GrB_Matrix B <- input matrix B + +#include +#include +#include +#include "mySemiRing.h" +#include "matrix.h" + +// Using tile size fixed at compile time, we don't need shared memory +#define tile_sz 32 + +using namespace cooperative_groups; + +template< typename T, int warpSize > +__device__ T reduce_sum(thread_block_tile g, T val) +{ + // Each iteration halves the number of active threads + // Each thread adds its partial sum[i] to sum[lane+i] + for (int i = g.size() / 2; i > 0; i /= 2) + { + val += g.shfl_down(val,i) ; + } + return val; // note: only thread 0 will return full sum +} + +#define intersects_per_thread 8 + +template< typename T_C, typename T_A, typename T_B, typename T_X, typename T_Y, typename T_Z> +__global__ void AxB_dot3_phase3_vssp +( + int64_t start, + int64_t end, + int64_t *Bucket, + GrB_Matrix C, + GrB_Matrix M, + GrB_Matrix A, + GrB_Matrix B, + int sz +) +{ + // Typed pointers to access data in A,B,C + T_A *Ax = (T_A*)A->x; + T_B *Bx = (T_B*)B->x; + T_C *Cx = (T_C*)C->x; + int64_t *Ci = C->i; + int64_t *Mi = M->i; + int64_t *Ai = A->i; + int64_t *Bi = B->i; + int64_t *Ap = A->p; + int64_t *Bp = B->p; + + // sz = expected non-zeros per dot + int m = 256/sz; + int nvecs = end - start; + int dpt = nvecs/(gridDim.x*32); + + int dots = (nvecs +dpt -1)/dpt; + + // zombie count + int zc = 0; + int64_t pair_id, im; + + // set thread ID + unsigned int tid_global = threadIdx.x+ blockDim.x* blockIdx.x; + unsigned int tid = threadIdx.x; + + unsigned long int b = blockIdx.x ; + + // Main loop over pairs + for (pair_id = start+ tid_global, im = 0; + pair_id < end && im < m; + pair_id += gridDim.x*blockDim.x, 
++im){ + + int64_t i = Mi[pair_id]; + int64_t j = Ci[pair_id] >> 4; + + if( j < 0) //Pre-zombie + { + zc++; + continue; + } + + int64_t pA = Ap[i]; + int64_t pA_end = Ap[i+1]; + int64_t nnzA = pA_end - pA; + + int64_t pB = B->p[j]; + int64_t pB_end = B->p[j+1]; + int64_t nnzB = pB_end - pB; + + //Search for each nonzero in the smaller vector to find intersection + bool cij_exists = false; + + T_A aki; + T_B bkj; + T_Z cij; + + if (nnzA <= nnzB) { + //---------------------------------------------------------------------- + // A(:,i) is very sparse compared to B(:,j) + //---------------------------------------------------------------------- + + while (pA < pA_end && pB < pB_end) + { + int64_t ia = Ai [pA] ; + int64_t ib = Bi [pB] ; + if (ia < ib) + { + // A(ia,i) appears before B(ib,j) + pA++ ; + } + else if (ib < ia) + { + // B(ib,j) appears before A(ia,i) + // discard all entries B(ib:ia-1,j) + int64_t pleft = pB + 1 ; + int64_t pright = pB_end - 1 ; + GB_TRIM_BINARY_SEARCH (ia, Bi, pleft, pright) ; + //ASSERT (pleft > pB) ; + pB = pleft ; + } + else // ia == ib == k + { + // A(k,i) and B(k,j) are the next entries to merge + #if defined ( GB_PHASE_1_OF_2 ) + cij_exists = true ; + break ; + #else + GB_DOT_MERGE ; + //GB_DOT_TERMINAL (cij) ; // break if cij == terminal + pA++ ; + pB++ ; + #endif + } + } + } + else { + //---------------------------------------------------------------------- + // B(:,j) is very sparse compared to A(:,i) + //---------------------------------------------------------------------- + + while (pA < pA_end && pB < pB_end) + { + int64_t ia = Ai [pA] ; + int64_t ib = Bi [pB] ; + if (ia < ib) + { + // A(ia,i) appears before B(ib,j) + // discard all entries A(ia:ib-1,i) + int64_t pleft = pA + 1 ; + int64_t pright = pA_end - 1 ; + GB_TRIM_BINARY_SEARCH (ib, Ai, pleft, pright) ; + //ASSERT (pleft > pA) ; + pA = pleft ; + } + else if (ib < ia) + { + // B(ib,j) appears before A(ia,i) + pB++ ; + } + else // ia == ib == k + { + // A(k,i) and B(k,j) are the next entries to merge + #if defined ( GB_PHASE_1_OF_2 ) + cij_exists = true ; + break ; + #else + GB_DOT_MERGE ; + //GB_DOT_TERMINAL (cij) ; // break if cij == terminal + pA++ ; + pB++ ; + #endif + } + } + + } + if ( cij_exists){ + GB_PUTC ( Ci[pair_id]=i ) ; + GB_PUTC ( Cx[pair_id]=(T_C)cij ) ; + } + else { + zc++; + //printf(" %lld, %lld is zombie %d!\n",i,j,zc); + GB_PUTC( Ci[pair_id] = GB_FLIP( i ) ) ; + } + + + } + + //-------------------------------------------------------------------------- + // reduce sum per-thread values to a single scalar + //-------------------------------------------------------------------------- + thread_block_tile tile = tiled_partition( this_thread_block()); + zc = reduce_sum(tile, zc); + + if( threadIdx.x ==0) { + //printf("warp %d zombie count = %d\n", blockIdx.x, zc); + atomicAdd( (unsigned long long int*)&(C->nzombies), (unsigned long long int)zc); + //printf(" Czombie = %lld\n",C->nzombies); + } + +} + diff --git a/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_vssp.cu.jit b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_vssp.cu.jit new file mode 100644 index 0000000000..d8d5f480b8 --- /dev/null +++ b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_vssp.cu.jit @@ -0,0 +1,230 @@ +const char* const templates_GB_jit_AxB_dot3_phase3_vssp_cu = "templates/GB_jit_AxB_dot3_phase3_vssp.cu\n" +"//------------------------------------------------------------------------------\n" +"// spGEMM_very_sparse_sparse.cu \n" 
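+// The vssp kernel embedded in this string intersects a very sparse A(:,i) with a much
+// larger B(:,j) (or vice versa): when the short list's next index is larger, it jumps
+// ahead in the long list with a trimmed binary search (GB_TRIM_BINARY_SEARCH) instead
+// of stepping one entry at a time.  A minimal host-side sketch of that idea,
+// illustration only, with a hand-rolled lower bound (lower_bound64 is a made-up
+// helper, not the GraphBLAS macro):
+//
+//     #include <cstdio>
+//     #include <cstdint>
+//
+//     // first p in [pleft,pright] with X [p] >= target, or pright+1 if none
+//     static int64_t lower_bound64 (const int64_t *X, int64_t pleft, int64_t pright,
+//                                   int64_t target)
+//     {
+//         while (pleft <= pright)
+//         {
+//             int64_t pmid = pleft + (pright - pleft) / 2 ;
+//             if (X [pmid] < target) pleft = pmid + 1 ; else pright = pmid - 1 ;
+//         }
+//         return pleft ;
+//     }
+//
+//     int main (void)
+//     {
+//         int64_t Ai [3] = { 3, 40, 99 } ;                      // very sparse A(:,i)
+//         int64_t Bi [9] = { 0, 1, 2, 3, 5, 40, 41, 97, 99 } ;  // denser B(:,j)
+//         int64_t pA = 0, pA_end = 3, pB = 0, pB_end = 9, hits = 0 ;
+//         while (pA < pA_end && pB < pB_end)
+//         {
+//             if      (Ai [pA] < Bi [pB]) pA++ ;
+//             else if (Bi [pB] < Ai [pA]) pB = lower_bound64 (Bi, pB+1, pB_end-1, Ai [pA]) ;
+//             else    { hits++ ; pA++ ; pB++ ; }  // common index k: multiply-add goes here
+//         }
+//         printf ("%lld common indices\n", (long long) hits) ;  // prints 3
+//         return 0 ;
+//     }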
+"//------------------------------------------------------------------------------\n" +"\n" +"// The spGEM_vssp CUDA kernel produces the semi-ring product of two\n" +"// sparse matrices of types T_A and T_B and common index space size n, to a \n" +"// output matrix of type T_C. The matrices are sparse, with different numbers\n" +"// of non-zeros and different sparsity patterns. \n" +"// ie. we want to produce C = A'*B in the sense of the given semi-ring.\n" +"\n" +"// This version uses a binary-search algorithm, when the sizes nnzA and nnzB\n" +"// are far apart in size, neither is very spare nor dense, for any size of N.\n" +"\n" +"// Both the grid and block are 1D, so blockDim.x is the # threads in a\n" +"// threadblock, and the # of threadblocks is grid.x\n" +"\n" +"// Let b = blockIdx.x, and let s be blockDim.x. s= 32 with a variable number\n" +"// of active threads = min( min(nzA, nzB), 32) \n" +"\n" +"// Thus, each t in threadblock b owns a part of the set of pairs in the \n" +"// sparse-sparse bucket of work. The job for each pair of vectors is to find \n" +"// the intersection of the index sets Ai and Bi, perform the semi-ring dot \n" +"// product on those items in the intersection, and finally\n" +"// on exit write it to Cx [pair].\n" +"\n" +"// int64_t start <- start of vector pairs for this kernel\n" +"// int64_t end <- end of vector pairs for this kernel\n" +"// int64_t *Bucket <- array of pair indices for all kernels \n" +"// GrB_Matrix C <- result matrix \n" +"// GrB_Matrix M <- mask matrix\n" +"// GrB_Matrix A <- input matrix A\n" +"// GrB_Matrix B <- input matrix B\n" +"\n" +"#include \n" +"#include \n" +"#include \n" +"#include \"mySemiRing.h\"\n" +"#include \"matrix.h\"\n" +"\n" +"// Using tile size fixed at compile time, we don't need shared memory\n" +"#define tile_sz 32 \n" +"\n" +"using namespace cooperative_groups;\n" +"\n" +"template< typename T, int warpSize >\n" +"__device__ T reduce_sum(thread_block_tile g, T val)\n" +"{\n" +" // Each iteration halves the number of active threads\n" +" // Each thread adds its partial sum[i] to sum[lane+i]\n" +" for (int i = g.size() / 2; i > 0; i /= 2)\n" +" {\n" +" val += g.shfl_down(val,i) ;\n" +" }\n" +" return val; // note: only thread 0 will return full sum\n" +"}\n" +"\n" +"#define intersects_per_thread 8\n" +"\n" +"template< typename T_C, typename T_A, typename T_B, typename T_X, typename T_Y, typename T_Z>\n" +"__global__ void AxB_dot3_phase3_vssp\n" +"(\n" +" int64_t start,\n" +" int64_t end,\n" +" int64_t *Bucket,\n" +" GrB_Matrix C,\n" +" GrB_Matrix M,\n" +" GrB_Matrix A,\n" +" GrB_Matrix B,\n" +" int sz\n" +")\n" +"{\n" +" // Typed pointers to access data in A,B,C\n" +" T_A *Ax = (T_A*)A->x;\n" +" T_B *Bx = (T_B*)B->x;\n" +" T_C *Cx = (T_C*)C->x;\n" +" int64_t *Ci = C->i;\n" +" int64_t *Mi = M->i;\n" +" int64_t *Ai = A->i;\n" +" int64_t *Bi = B->i;\n" +" int64_t *Ap = A->p;\n" +" int64_t *Bp = B->p;\n" +"\n" +" // sz = expected non-zeros per dot \n" +" int m = 256/sz;\n" +" int nvecs = end - start;\n" +" int dpt = nvecs/(gridDim.x*32);\n" +" \n" +" int dots = (nvecs +dpt -1)/dpt; \n" +"\n" +" // zombie count\n" +" int zc = 0;\n" +" int64_t pair_id, im;\n" +"\n" +" // set thread ID\n" +" unsigned int tid_global = threadIdx.x+ blockDim.x* blockIdx.x;\n" +" unsigned int tid = threadIdx.x;\n" +"\n" +" unsigned long int b = blockIdx.x ;\n" +"\n" +" // Main loop over pairs \n" +" for (pair_id = start+ tid_global, im = 0; \n" +" pair_id < end && im < m; \n" +" pair_id += gridDim.x*blockDim.x, ++im){\n" +"\n" +" int64_t i = 
Mi[pair_id];\n" +" int64_t j = Ci[pair_id] >> 4;\n" +"\n" +" int64_t pA = Ap[i];\n" +" int64_t pA_end = Ap[i+1];\n" +" int64_t nnzA = pA_end - pA;\n" +"\n" +" int64_t pB = B->p[j]; \n" +" int64_t pB_end = B->p[j+1]; \n" +" int64_t nnzB = pB_end - pB;\n" +"\n" +" //Search for each nonzero in the smaller vector to find intersection \n" +" bool cij_exists = false;\n" +"\n" +" T_A aki;\n" +" T_B bkj;\n" +" T_Z cij;\n" +"\n" +" if (nnzA <= nnzB) {\n" +" //----------------------------------------------------------------------\n" +" // A(:,i) is very sparse compared to B(:,j)\n" +" //----------------------------------------------------------------------\n" +"\n" +" while (pA < pA_end && pB < pB_end)\n" +" {\n" +" int64_t ia = Ai [pA] ;\n" +" int64_t ib = Bi [pB] ;\n" +" if (ia < ib)\n" +" { \n" +" // A(ia,i) appears before B(ib,j)\n" +" pA++ ;\n" +" }\n" +" else if (ib < ia)\n" +" { \n" +" // B(ib,j) appears before A(ia,i)\n" +" // discard all entries B(ib:ia-1,j)\n" +" int64_t pleft = pB + 1 ;\n" +" int64_t pright = pB_end - 1 ;\n" +" GB_TRIM_BINARY_SEARCH (ia, Bi, pleft, pright) ;\n" +" //ASSERT (pleft > pB) ;\n" +" pB = pleft ;\n" +" }\n" +" else // ia == ib == k\n" +" { \n" +" // A(k,i) and B(k,j) are the next entries to merge\n" +" #if defined ( GB_PHASE_1_OF_2 )\n" +" cij_exists = true ;\n" +" break ;\n" +" #else\n" +" GB_DOT_MERGE ;\n" +" //GB_DOT_TERMINAL (cij) ; // break if cij == terminal\n" +" pA++ ;\n" +" pB++ ;\n" +" #endif\n" +" }\n" +" }\n" +" }\n" +" else {\n" +" //----------------------------------------------------------------------\n" +" // B(:,j) is very sparse compared to A(:,i)\n" +" //----------------------------------------------------------------------\n" +"\n" +" while (pA < pA_end && pB < pB_end)\n" +" {\n" +" int64_t ia = Ai [pA] ;\n" +" int64_t ib = Bi [pB] ;\n" +" if (ia < ib)\n" +" { \n" +" // A(ia,i) appears before B(ib,j)\n" +" // discard all entries A(ia:ib-1,i)\n" +" int64_t pleft = pA + 1 ;\n" +" int64_t pright = pA_end - 1 ;\n" +" GB_TRIM_BINARY_SEARCH (ib, Ai, pleft, pright) ;\n" +" //ASSERT (pleft > pA) ;\n" +" pA = pleft ;\n" +" }\n" +" else if (ib < ia)\n" +" { \n" +" // B(ib,j) appears before A(ia,i)\n" +" pB++ ;\n" +" }\n" +" else // ia == ib == k\n" +" { \n" +" // A(k,i) and B(k,j) are the next entries to merge\n" +" #if defined ( GB_PHASE_1_OF_2 )\n" +" cij_exists = true ;\n" +" break ;\n" +" #else\n" +" GB_DOT_MERGE ;\n" +" //GB_DOT_TERMINAL (cij) ; // break if cij == terminal\n" +" pA++ ;\n" +" pB++ ;\n" +" #endif\n" +" }\n" +" }\n" +"\n" +" }\n" +" if ( cij_exists){\n" +" GB_PUTC ( Ci[pair_id]=i ) ;\n" +" GB_PUTC ( Cx[pair_id]=(T_C)cij ) ;\n" +" }\n" +" else {\n" +" zc++; \n" +" //printf(\" %lld, %lld is zombie %d!\\n\",i,j,zc);\n" +" GB_PUTC( Ci[pair_id] = GB_FLIP( i ) ) ;\n" +" }\n" +"\n" +"\n" +" }\n" +"\n" +" //--------------------------------------------------------------------------\n" +" // reduce sum per-thread values to a single scalar\n" +" //--------------------------------------------------------------------------\n" +" thread_block_tile tile = tiled_partition( this_thread_block());\n" +" zc = reduce_sum(tile, zc);\n" +"\n" +" if( threadIdx.x ==0) {\n" +" //printf(\"warp %d zombie count = %d\\n\", blockIdx.x, zc);\n" +" atomicAdd( (unsigned long long int*)&(C->zombie_count), (unsigned long long int)zc);\n" +" //printf(\" Czombie = %lld\\n\",C->zombie_count);\n" +" }\n" +"\n" +"}\n" +"\n" +; diff --git a/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_vsvs.cu b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_vsvs.cu new file mode 100644 
index 0000000000..7482a86b3f --- /dev/null +++ b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_vsvs.cu @@ -0,0 +1,208 @@ +//****************************************************************************** +// Sparse dot version of Matrix-Matrix multiply with mask +// Each thread in this kernel is responsible for m vector-pairs(x,y), +// finding intersections and producting the final dot product for each +// using a serial merge algorithm on the sparse vectors. +// m = 256/sz, where sz is in {4, 16, 64, 256} +// For a vector-pair, sz = xnz + ynz +// Template on +// Parameters: + +// int64_t start <- start of vector pairs for this kernel +// int64_t end <- end of vector pairs for this kernel +// int64_t *Bucket <- array of pair indices for all kernels +// matrix *C <- result matrix +// matrix *M <- mask matrix +// matrix *A <- input matrix A +// matrix *B <- input matrix B +// int sz <- nnz of very sparse vectors + +// Blocksize is 1024, uses warp and block reductions to count zombies produced. +//****************************************************************************** +#define GB_KERNEL +#include +#include +#include +#include +#include "matrix.h" +#include "mySemiRing.h" + +using namespace cooperative_groups; + +template< typename T, int tile_sz> +__inline__ __device__ +T warp_ReduceSumPlus( thread_block_tile g, T val) +{ + // Each iteration halves the number of active threads + // Each thread adds its partial sum[i] to sum[lane+i] + for (int i = g.size() / 2; i > 0; i /= 2) { + //printf("thd%d %d OP %d is %d\n", threadIdx.x, val, fold, OP( val, fold)); + val += g.shfl_down( val, i); + } + return val; // note: only thread 0 will return full sum +} + +template< typename T, int tile_sz> +__inline__ __device__ +T warp_Reduce( thread_block_tile g, T val) +{ + // Each iteration halves the number of active threads + // Each thread adds its partial sum[i] to sum[lane+i] + for (int i = g.size() / 2; i > 0; i /= 2) { + T next = g.shfl_down( val, i) ; + val = GB_ADD( sum, next ) ; + } + return val; // note: only thread 0 will return full sum +} + +template +__inline__ __device__ +T block_ReduceSum(thread_block g, T val) +{ + static __shared__ T shared[warpSize]; // Shared mem for 32 partial sums + int lane = threadIdx.x & 31 ; // % warpSize; + int wid = threadIdx.x >> 5 ; // / warpSize; + thread_block_tile tile = tiled_partition( g ); + + // Each warp performs partial reduction + val = warp_ReduceSumPlus( tile, val); + + // Wait for all partial reductions + if (lane==0) shared[wid]=val; // Write reduced value to shared memory + __syncthreads(); // Wait for all partial reductions + + if (wid > 0 || gridDim.x == 1 ) return val; + + //read from shared memory only if that warp existed + val = (threadIdx.x < (blockDim.x / warpSize ) ) ? shared[lane] : 0; + //printf("thd%d warp loaded val = %d\n", threadIdx.x, lane, val); + + if (wid==0) val = warp_ReduceSumPlus( tile, val); //Final reduce within first warp + + return val; +} + +template< typename T_C, typename T_A, typename T_B, typename T_X, typename T_Y, typename T_Z> +__global__ void AxB_dot3_phase3_vsvs +( + const int64_t start, + const int64_t end, + const int64_t *__restrict__ Bucket, + const GrB_Matrix C, + const GrB_Matrix M, + const GrB_Matrix A, + const GrB_Matrix B, + const int sz +) +{ + int dots = end - start; + // sz = expected non-zeros per dot + /* + int m = (gridDim.x*blockDim.x)*256/sz; + int dpt = (nvecs+ gridDim.x*blockDim.x -1)/(gridDim.x*blockDim.x); + m = dpt < m ? 
dpt : m; + + int dots = (nvecs +m -1)/m; + */ + const T_A *__restrict__ Ax = (T_A *)A->x ; + const T_B *__restrict__ Bx = (T_B *)B->x ; + T_C *__restrict__ Cx = (T_C *)C->x ; + int64_t *__restrict__ Ci = C->i ; + const int64_t *__restrict__ Mi = M->i ; + const int64_t *__restrict__ Ai = A->i ; + const int64_t *__restrict__ Bi = B->i ; + const int64_t *__restrict__ Ap = A->p ; + const int64_t *__restrict__ Bp = B->p ; + + int pfirst, plast; + + GB_PARTITION (pfirst, plast, dots, blockIdx.x, gridDim.x ) ; + /* + if( threadIdx.x ==0 ) + { + printf("block%d %d dots/thrd, start,end = %ld,%ld pf,pl=%d,%d blockDim=%d\n", + blockIdx.x, (dots + blockDim.x*gridDim.x -1)/(blockDim.x*gridDim.x), + start, end, pfirst, plast, blockDim.x); + } + __syncthreads(); + */ + + + int zc = 0 ; + + int64_t pair_id; + + //for ( int tid= threadIdx.x +blockDim.x*blockIdx.x; + // tid < dots; + // tid += blockDim.x * gridDim.x) + for ( int tid = pfirst+ threadIdx.x ; + tid < plast; + tid += blockDim.x ) + { + + pair_id = Bucket[ start + tid ]; + + int64_t i = Mi [pair_id] ; + int64_t j = Ci [pair_id]>>4 ; + + int64_t pA = Ap[i] ; + int64_t pA_end = Ap[i+1] ; + int64_t pB = Bp[j] ; + int64_t pB_end = Bp[j+1] ; + + T_A aki; + T_B bkj; + T_Z cij ; + + bool cij_exists = false; + + while (pA < pA_end && pB < pB_end) + { + int64_t ia = Ai [pA] ; + int64_t ib = Bi [pB] ; + if( ia == ib) + { + // A(k,i) and B(k,j) are the next entries to merge + #if defined ( GB_PHASE_1_OF_2 ) + cij_exists = true ; + break ; + #else + GB_DOT_MERGE ; + //GB_DOT_TERMINAL (cij) ; // break if cij == terminal + pA++ ; + pB++ ; + #endif + } + else + { + // A(ia,i) appears before B(ib,j) + pA += ( ia < ib); + // B(ib,j) appears before A(ia,i) + pB += ( ib < ia); + } + } + if (cij_exists){ + GB_PUTC ( Ci[pair_id] = i ) ; + GB_PUTC ( Cx[pair_id] = (T_C)cij ) ; + } + else{ + //printf(" %lld, %lld is zombie %d!\n",i,j,zc); + zc++; + GB_PUTC( Ci[pair_id] = GB_FLIP( i ) ) ; + } + } + + __syncthreads(); + + //printf("thd%d zombie count = %d\n",threadIdx.x,zc); + zc = block_ReduceSum( this_thread_block(), zc); + __syncthreads(); + + if( threadIdx.x == 0 && zc > 0) { + //printf("block%d zombie count = %d\n", blockIdx.x, zc); + atomicAdd( (unsigned long long int*)&(C->nzombies), (unsigned long long int)zc); + //C->nzombies += (unsigned long long int)zc; + //printf("blk:%d Czombie = %lld\n", blockIdx.x,C->nzombies); + } + +} diff --git a/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_vsvs.cu.jit b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_vsvs.cu.jit new file mode 100644 index 0000000000..6885b62420 --- /dev/null +++ b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_vsvs.cu.jit @@ -0,0 +1,216 @@ +const char* const templates_GB_jit_AxB_dot3_phase3_vsvs_cu = "templates/GB_jit_AxB_dot3_phase3_vsvs.cu\n" +"//******************************************************************************\n" +"// Sparse dot version of Matrix-Matrix multiply with mask \n" +"// Each thread in this kernel is responsible for m vector-pairs(x,y), \n" +"// finding intersections and producting the final dot product for each\n" +"// using a serial merge algorithm on the sparse vectors. 
\n" +"// m = 256/sz, where sz is in {4, 16, 64, 256}\n" +"// For a vector-pair, sz = xnz + ynz \n" +"// Template on \n" +"// Parameters:\n" +"\n" +"// int64_t start <- start of vector pairs for this kernel\n" +"// int64_t end <- end of vector pairs for this kernel\n" +"// int64_t *Bucket <- array of pair indices for all kernels \n" +"// matrix *C <- result matrix \n" +"// matrix *M <- mask matrix\n" +"// matrix *A <- input matrix A\n" +"// matrix *B <- input matrix B\n" +"// int sz <- nnz of very sparse vectors\n" +"\n" +"// Blocksize is 1024, uses warp and block reductions to count zombies produced.\n" +"//******************************************************************************\n" +"#include \n" +"#include \n" +"#include \n" +"#include \n" +"//#include \"GB_matrix.h\"\n" +"#include \"matrix.h\"\n" +"#include \"mySemiRing.h\"\n" +"\n" +"using namespace cooperative_groups;\n" +"\n" +"template< typename T, int tile_sz>\n" +"__inline__ __device__ \n" +"T warp_ReduceSumPlus( thread_block_tile g, T val)\n" +"{\n" +" // Each iteration halves the number of active threads\n" +" // Each thread adds its partial sum[i] to sum[lane+i]\n" +" for (int i = g.size() / 2; i > 0; i /= 2) {\n" +" //printf(\"thd%d %d OP %d is %d\\n\", threadIdx.x, val, fold, OP( val, fold));\n" +" val += g.shfl_down( val, i);\n" +" }\n" +" return val; // note: only thread 0 will return full sum\n" +"}\n" +"\n" +"template< typename T, int tile_sz>\n" +"__inline__ __device__ \n" +"T warp_Reduce( thread_block_tile g, T val)\n" +"{\n" +" // Each iteration halves the number of active threads\n" +" // Each thread adds its partial sum[i] to sum[lane+i]\n" +" for (int i = g.size() / 2; i > 0; i /= 2) {\n" +" //printf(\"thd%d %d OP %d is %d\\n\", threadIdx.x, val, fold, OP( val, fold));\n" +" T next = g.shfl_down( val, i) ;\n" +" val = GB_ADD( sum, next ) ; \n" +" }\n" +" //if (threadIdx.x ==0) printf(\"thd%d single warp sum is %d\\n\", threadIdx.x, val);\n" +" return val; // note: only thread 0 will return full sum\n" +"}\n" +"\n" +"template\n" +"__inline__ __device__\n" +"T block_ReduceSum(thread_block g, T val)\n" +"{\n" +" static __shared__ T shared[warpSize]; // Shared mem for 32 partial sums\n" +" int lane = threadIdx.x % warpSize;\n" +" int wid = threadIdx.x / warpSize;\n" +" thread_block_tile tile = tiled_partition( g );\n" +"\n" +" // Each warp performs partial reduction\n" +" val = warp_ReduceSumPlus( tile, val); \n" +"\n" +" // Wait for all partial reductions\n" +" if (lane==0) { \n" +" //printf(\"thd%d warp%d sum is %d\\n\", threadIdx.x, wid, val);\n" +" shared[wid]=val; // Write reduced value to shared memory\n" +" //printf(\"thd%d stored warp %d sum %d\\n\", threadIdx.x, wid, val);\n" +" }\n" +" tile.sync(); // Wait for all partial reductions\n" +"\n" +" if (wid > 0 || gridDim.x == 1 ) return val;\n" +" //read from shared memory only if that warp existed\n" +" val = (threadIdx.x < blockDim.x / warpSize) ? 
shared[lane] : 0;\n" +" //printf(\"thd%d warp loaded val = %d\\n\", threadIdx.x, lane, val);\n" +"\n" +" \n" +" if (wid==0) val = warp_ReduceSumPlus( tile, val); //Final reduce within first warp\n" +"\n" +" return val;\n" +"}\n" +"\n" +"template< typename T_C, typename T_A, typename T_B, typename T_X, typename T_Y, typename T_Z>\n" +"__global__ void AxB_dot3_phase3_vsvs\n" +"( \n" +" int64_t start, \n" +" int64_t end,\n" +" int64_t *Bucket, \n" +" GrB_Matrix C, \n" +" GrB_Matrix M, \n" +" GrB_Matrix A, \n" +" GrB_Matrix B,\n" +" int sz \n" +")\n" +"{\n" +" // sz = expected non-zeros per dot \n" +" int m = 256/sz;\n" +" int nvecs = end - start;\n" +" int dpt = nvecs/32;\n" +" m = dpt < m ? dpt : m;\n" +" //__shared__ int zombie_local[32];\n" +" /*\n" +" if( threadIdx.x ==0 && blockIdx.x == 0)\n" +" printf(\" %d dots/thrd, nvecs = %d blockDim=%d\\n\",sz, nvecs, blockDim.x);\n" +" __syncthreads();\n" +" */\n" +" int dots = (nvecs +m -1)/m; \n" +" int zc = 0;\n" +"\n" +" T_A *Ax = (T_A *)A->x ;\n" +" T_B *Bx = (T_B *)B->x ;\n" +" T_C *Cx = (T_C *)C->x ;\n" +" int64_t *Ci = C->i ;\n" +" int64_t *Mi = M->i ;\n" +" int64_t *Ai = A->i ;\n" +" int64_t *Bi = B->i ;\n" +" int64_t *Ap = A->p ;\n" +" int64_t *Bp = B->p ;\n" +" \n" +" for ( int tid= threadIdx.x +blockDim.x*blockIdx.x;\n" +" tid < dots;\n" +" tid += blockDim.x * gridDim.x) {\n" +" int pair_id, im; \n" +" //if (threadIdx.x ==0)\n" +" // printf(\"thd%u pi=%lld\\n\",tid, start+threadIdx.x); \n" +" // __syncthreads();\n" +"\n" +" for (pair_id = start+tid, im = 0; \n" +" im < m && pair_id < end; \n" +" ++im, pair_id += dots ){\n" +"\n" +" int64_t i = Mi [pair_id] ;\n" +" int64_t j = Ci [pair_id]>>4 ; \n" +" //int64_t i = M->i[pair_id];\n" +" //int64_t j = C->i[pair_id] >> 4;\n" +" //if (threadIdx.x ==0)\n" +" // printf(\"thd%u i,j=%lld,%lld\\n\",tid, i,j); \n" +" // __syncthreads();\n" +" \n" +" // printf(\"thd%d pi=%d xn=%lld yn=%lld\\n\",tid, pair_id, \n" +" // A->p[i+1]- A->p[i],\n" +" // B->p[j+1]- B->p[j]);\n" +"\n" +" int64_t pA = Ap[i];\n" +" int64_t pA_end = Ap[i+1];\n" +" int64_t pB = Bp[j]; \n" +" int64_t pB_end = Bp[j+1]; \n" +"\n" +" T_A aki;\n" +" T_B bkj;\n" +" T_Z cij ;\n" +"\n" +" bool cij_exists = false;\n" +"\n" +" while (pA < pA_end && pB < pB_end)\n" +" {\n" +" int64_t ia = Ai [pA] ;\n" +" int64_t ib = Bi [pB] ;\n" +" if (ia < ib)\n" +" { \n" +" // A(ia,i) appears before B(ib,j)\n" +" pA++ ;\n" +" }\n" +" else if (ib < ia)\n" +" { \n" +" // B(ib,j) appears before A(ia,i)\n" +" pB++ ;\n" +" }\n" +" else // ia == ib == k\n" +" { \n" +" // A(k,i) and B(k,j) are the next entries to merge\n" +" #if defined ( GB_PHASE_1_OF_2 )\n" +" cij_exists = true ;\n" +" break ;\n" +" #else\n" +" GB_DOT_MERGE ;\n" +" //GB_DOT_TERMINAL (cij) ; // break if cij == terminal\n" +" pA++ ;\n" +" pB++ ;\n" +" #endif\n" +" }\n" +" }\n" +" if (cij_exists){\n" +" GB_PUTC ( Ci[pair_id] = i ) ;\n" +" GB_PUTC ( Cx[pair_id] = (T_C)cij ) ;\n" +" }\n" +" else{\n" +" zc++; \n" +" //printf(\" %lld, %lld is zombie %d!\\n\",i,j,zc);\n" +" GB_PUTC( Ci[pair_id] = GB_FLIP( i ) ) ;\n" +" }\n" +" }\n" +" \n" +" }\n" +" //printf(\"thd%d zombie count = %d\\n\",threadIdx.x,zc);\n" +" zc = block_ReduceSum( this_thread_block(), zc); \n" +" \n" +" __syncthreads();\n" +" if( threadIdx.x == 0 && zc > 0) {\n" +" //printf(\"block zombie count = %d\\n\",zc);\n" +" atomicAdd( (unsigned long long int*)&(C->zombie_count), (unsigned long long int)zc);\n" +" //C->zombie_count += (unsigned long long int)zc;\n" +" //printf(\"blk:%d Czombie = %lld\\n\", blockIdx.x,C->zombie_count);\n" 
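+// Zombie bookkeeping in the kernel above: each thread counts its zombies locally, the
+// block folds those counts together, and a single thread issues one atomicAdd into the
+// matrix-wide counter, rather than one global atomic per zombie.  A minimal standalone
+// sketch of that shape, illustration only, using a shared-memory tally in place of the
+// shuffle-based block_ReduceSum (count_zombies_demo and flags are made-up names;
+// launch as, e.g., count_zombies_demo<<<nblocks,1024>>>(d_flags, n, d_nzombies)):
+//
+//     __global__ void count_zombies_demo (const int *flags, int n,
+//                                         unsigned long long *nzombies)
+//     {
+//         __shared__ unsigned int block_count ;
+//         if (threadIdx.x == 0) block_count = 0 ;
+//         __syncthreads ( ) ;
+//         unsigned int zc = 0 ;
+//         for (int k = blockIdx.x * blockDim.x + threadIdx.x ; k < n ;
+//              k += blockDim.x * gridDim.x)
+//         {
+//             zc += (flags [k] == 0) ;                 // this dot produced no entry
+//         }
+//         if (zc > 0) atomicAdd (&block_count, zc) ;   // cheap shared-memory atomic
+//         __syncthreads ( ) ;
+//         if (threadIdx.x == 0 && block_count > 0)
+//         {
+//             atomicAdd (nzombies, (unsigned long long) block_count) ; // one per block
+//         }
+//     }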
+" }\n" +" \n" +"}\n" +; diff --git a/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_warpix.cu b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_warpix.cu new file mode 100644 index 0000000000..fff127a074 --- /dev/null +++ b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_warpix.cu @@ -0,0 +1,388 @@ +//------------------------------------------------------------------------------ +// AxB_dot3_phase3_warpix.cu +//------------------------------------------------------------------------------ + +// This CUDA kernel produces the semi-ring product of two +// sparse matrices of types T_A and T_B and common index space size n, to a +// output matrix of type T_C. The matrices are sparse, with different numbers +// of non-zeros and different sparsity patterns. +// ie. we want to produce C = A'*B in the sense of the given semi-ring. + +// This version uses a merge-path algorithm, when the sizes nnzA and nnzB are +// relatively close in size, neither is very spare nor dense, for any size of N. +// Handles arbitrary sparsity patterns with guaranteed load balance. + +// Both the grid and block are 1D, so blockDim.x is the # threads in a +// threadblock, and the # of threadblocks is grid.x + +// Let b = blockIdx.x, and let s be blockDim.x. s= 32 with a variable number +// of active threads = min( min(g_xnz, g_ynz), 32) + +// Thus, threadblock b owns a part of the index set spanned by g_xi and g_yi. Its job +// is to find the intersection of the index sets g_xi and g_yi, perform the semi-ring dot +// product on those items in the intersection, and finally reduce this data to a scalar, +// on exit write it to g_odata [b]. + +// int64_t start <- start of vector pairs for this kernel +// int64_t end <- end of vector pairs for this kernel +// int64_t *Bucket <- array of pair indices for all kernels +// matrix *C <- result matrix +// matrix *M <- mask matrix +// matrix *A <- input matrix A +// matrix *B <- input matrix B +#define GB_KERNEL +#include +#include +#include "matrix.h" +#include +#include "mySemiRing.h" + +// Using tile size fixed at compile time, we don't need shared memory +#define tile_sz 32 + +using namespace cooperative_groups; + +template< typename T, int warp_sz> +__device__ __inline__ +T GB_reduce_sum(thread_block_tile g, T val) +{ + // Each iteration halves the number of active threads + // Each thread adds its partial sum[i] to sum[lane+i] + for (int i = g.size() / 2; i > 0; i /= 2) + { + T next = g.shfl_down( val, i); + val = GB_ADD( val, next ) ; + } + return val; +} + +template< typename T, int warp_sz> +__device__ __inline__ +T reduce_plus(thread_block_tile g, T val) +{ + // Each iteration halves the number of active threads + // Each thread adds its partial sum[i] to sum[lane+i] + for (int i = g.size() / 2; i > 0; i /= 2) + { + val += g.shfl_down( val, i) ; + } + return val; // note: only thread 0 will return full sum and flag value +} + +#define intersects_per_thread 8 + +template< typename T_C, typename T_A, typename T_B, typename T_X, typename T_Y, typename T_Z> +__global__ void AxB_dot3_phase3_warpix +( + int64_t start, + int64_t end, + int64_t *__restrict__ Bucket, + GrB_Matrix C, + GrB_Matrix M, + GrB_Matrix A, + GrB_Matrix B, + int sz +) +{ + + T_A *__restrict__ Ax = (T_A*)A->x; + T_B *__restrict__ Bx = (T_B*)B->x; + T_C *__restrict__ Cx = (T_C*)C->x; + int64_t *__restrict__ Ci = C->i; + int64_t *__restrict__ Mi = M->i; + int64_t *__restrict__ Mp = M->p; + int64_t *__restrict__ Ai = A->i; + int64_t *__restrict__ Bi = B->i; + int64_t *__restrict__ Ap = A->p; + int64_t 
*__restrict__ Bp = B->p; + + int64_t mnvec = M->nvec; + + // zombie count + int zc; + + int64_t pair_id; + + // set thread ID + int tid_global = threadIdx.x+ blockDim.x* blockIdx.x; + int tid = threadIdx.x; + int b = blockIdx.x ; + + // total items to be inspected + int64_t nnzA = 0; + int64_t nnzB = 0; + + thread_block_tile tile = tiled_partition( this_thread_block()); + + //int parts = gridDim.x; //Each warp is a part + + //Find our part of the work bucket + int64_t pfirst, plast, kfirst, klast ; + GB_PARTITION (pfirst, plast, end-start, b, gridDim.x ) ; + /* + if( tid ==0 ) { + printf("block%d is alive, pf,pl=%ld,%ld \n", b, pfirst, plast); + } + __syncthreads(); + */ + + + __shared__ int64_t As[256]; + __shared__ int64_t Bs[256]; + __shared__ T_A Axs[256]; + __shared__ T_B Bxs[256]; + + /* + int Bpl[9]; // local offsets into shared for multiple vectors of B + int shr_vec[8] ; //columns of B we see in this task + + pair_id = Bucket[pfirst]; + int64_t i = Mi[pair_id] ; + int vecs = 1 ; + int last_vec = i; + shr_vec[0] = i; + for (int id =1; id< plast-pfirst; id++) + { + pair_id = Bucket[pfirst+id]; + i = Mi[pair_id]; + if (i == last_vec) continue; + vecs++; + shr_vec[vecs] = i; + last_vec = i; + } + int all_loaded = 0; + + Bpl[0] = 0; + for ( int k = 0; k < vecs; k++) + { + int64_t pA = Ap[ shr_vec[k] ]; + int64_t pA_end = Ap[ shr_vec[k] +1]; + nnzA = pA_end - pA; + Bpl[k+1] = Bpl[k] + nnzA; + for (int i = tid ; i < nnzA; i+= blockDim.x) + { + As[ Bpl[k] +i ] = Ai[ pA + i ] ; + } + __syncthreads(); + } + + //pre-load columns of B, which will be reused, to shared memory + //Due to loading a contigious block with stride 1 this is fast + + all_loaded = (Bpl[vecs] < 256 ); + if( tid == 0 ) { + printf("block%d loaded %d vals from B, vecs=%d, all_loaded=%d\n", + b, Bpl[vecs], vecs, all_loaded ); + } + __syncthreads(); + + + // reset counter + */ + // Main loop over pairs + for (int id = start + pfirst; // loop on pairs + id < start+ plast; + id ++ ) + { + int64_t pair_id = Bucket[id]; + + int64_t i = Mi[pair_id]; + int64_t j = Ci[pair_id] >> 4; + + int64_t pA = Ap[i]; + int64_t pA_end = Ap[i+1]; + nnzA = pA_end - pA; + + int64_t pB = Bp[j]; + int64_t pB_end = Bp[j+1]; + nnzB = pB_end - pB; + + zc = 0 ; + int j_last = -1 ; + + + // No search, this warp does all the work + + int tx_start = pA; + int tx_end = pA_end; + int ty_start = pB; + int ty_end = pB_end; + + for ( int i = tid; i < nnzA ; i+= blockDim.x) + { + As [i] = Ai[ pA + i]; + Axs[i] = Ax[ pA + i]; + } + __syncthreads(); + + if ( j != j_last) { + for ( int i = tid; i < nnzB ; i+= blockDim.x) + { + Bs [i] = Bi[ pB + i]; + Bxs[i] = Bx[ pB + i]; + } + __syncthreads(); + j_last = j; + } + + + /* + if ( tid==0 ) { + //printf("block %d dot %lld i,j= %lld,%lld\n", blockIdx.x, pair_id, i, j); + printf("block%d dot %ld(i,j)=(%ld,%ld) xs,xe= %d,%d ys,ye = %d,%d \n", + b, pair_id, i, j, tx_start,tx_end, ty_start, ty_end); + //for(int a = 0; a < nnzA; a++) printf(" As[%d]:%ld ",a, As[j]); + } + tile.sync(); + */ + + + + // Warp intersection: balanced by design, no idle threads. + // Each 32 thread warp will handle 32 comparisons per loop. + // Either A or B takes stride 4, other takes stride 8 + // For this version A strides 4, B strides 8 + T_A aki; + T_B bkj; + T_Z cij = GB_IDENTITY ; + int Astride = nnzA > nnzB ? 8 : 4; + int Ashift = nnzA > nnzB ? 3 : 2; + int Amask = nnzA > nnzB ? 7 : 3; + int Bstride = nnzB >= nnzA ? 
8 : 4; + //printf(" Astride = %d, Bstride = %d\n", Astride, Bstride); + + // TODO PLUS_PAIR_INT64, FP32, FP64: no need for cij_exists. + // just check if cij > 0 + + int cij_exists = 0 ; + + //Warp intersection dot product + int bitty_row = tid & Amask ; + int bitty_col = tid >> Ashift ; + + int k = tx_start + bitty_row ; + int l = ty_start + bitty_col ; + + //Ai[k] = As[ k -pA ]; for lookup + //Bi[l] = Bs[ l -pB ]; + + + int inc_k,inc_l; + + int active = ( ( k < tx_end) && (l < ty_end ) ); + + /* + printf("block%d tid%d Ai,As=%ld,%ld Bi,Bs=%ld,%ld k,l =%d,%d active:%d\n", + b,tid, Ai[k], As[k -pA], Bi[l], Bs[l -pB], + k, l, active ); + */ + + + while ( tile.any(active) ) + { + inc_k = 0; + inc_l = 0; + int kp = k-pA; + int lp = l-pB; + if ( active ) + { + coalesced_group g = coalesced_threads(); + if ( g.thread_rank() == g.size()-1) + { + inc_k = ( As[kp] <= Bs[lp] ) ; + inc_l = ( Bs[lp] <= As[kp] ) ; + // printf("block%d tid%d inc_k= %d inc_l = %d\n",b, tid, inc_k, inc_l ); + } + //tile.sync(); + + if ( As [kp] == Bs [lp] ) + { + //Axs[kp] = Ax[k]; + //Bxs[lp] = Bx[l]; + + GB_GETA ( aki=(T_Z)Axs[kp] ) ; + GB_GETB ( bkj=(T_Z)Bxs[lp] ) ; + if (cij_exists) + { + T_Z t = GB_MULT( (T_Z) aki, (T_Z) bkj); + GB_ADD_F( cij, t ) ; + //printf("block%d thd%d ix at %ld(%ld) cij += %d * %d\n",b, tid, Ai[k], As[kp], aki, bkj); + } + else + { + cij_exists = 1 ; + cij = GB_MULT ( (T_Z) aki, (T_Z) bkj) ; + //printf(" thd%d ix at %ld(%ld) cij = %d * %d \n", tid, Ai[k], Ais[kp], aki, bkj); + } + } + // TODO check terminal condition + //printf(" block%u work value = %d, exists = %d\n", b, cij, cij_exists); + //printf("block%d tid%d k,l = %d,%d Ai,Bi = %ld,%ld \n", b, tid, k, l, Ai[k], Bi[l] ); + } + //tile.sync(); + //inc_k = tile.shfl_down( inc_k, 31-tid); + if( tile.any(inc_k) ) { + k =1+ tile.shfl_down(k,31-tid) + bitty_row ; // tid%Astride; + //Ais [k-pA] = As[k-pA]; + //Axs [bitty_row] = Ax[k]; + } + if( tile.any(inc_l) ) { + l =1+ tile.shfl_down(l,31-tid) + bitty_col ; // tid/Astride; + //Bis [l-pB] = Bs[l-pB]; + //Bxs [bitty_col] = Bx[l]; + } + active = ( ( k < tx_end) && (l < ty_end ) ); + //printf("block%d tid = %d k = %d l= %d active=%d\n", b, tid, k, l,active); + } + tile.sync(); + + //-------------------------------------------------------------------------- + // reduce sum per-thread values to a single scalar, get OR of flag + //-------------------------------------------------------------------------- + + // Do vote here for control. 
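+ // A minimal standalone sketch of the broadcast idiom used in the loop above:
+ // tile.shfl_down( x, 31-tid) makes lane tid read from lane tid + (31-tid) = 31,
+ // so every lane receives the stepping decision made by the last (highest) lane.
+ // Illustration only, assuming a full 32-lane tile; g.shfl( x, 31) would be the more
+ // direct equivalent, and broadcast_top_lane_demo is a made-up name.
+ //
+ //     #include <cstdio>
+ //     #include <cooperative_groups.h>
+ //     using namespace cooperative_groups;
+ //
+ //     __global__ void broadcast_top_lane_demo (void)
+ //     {
+ //         thread_block_tile<32> g = tiled_partition<32>( this_thread_block()) ;
+ //         int tid = g.thread_rank() ;
+ //         int x   = 10 * tid ;                        // lane 31 holds 310
+ //         int top = g.shfl_down( x, 31 - tid) ;       // every lane reads lane 31
+ //         if (tid == 0) printf ("lane 0 sees %d\n", top) ;  // prints 310
+ //     }
+ //
+ //     int main (void)
+ //     {
+ //         broadcast_top_lane_demo <<<1,32>>> ( ) ;
+ //         cudaDeviceSynchronize ( ) ;
+ //         return 0 ;
+ //     }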
+ cij_exists = tile.any( cij_exists); + tile.sync(); + + if (cij_exists) + { + cij = GB_reduce_sum( tile, cij ); + } + tile.sync(); + + + // Atomic write result for this block to global mem + if (tid == 0) + { + //printf ("final %d : %d exists = %d\n", b, cij, cij_exists) ; + if (cij_exists) + { + //printf("block%d i,j =%ld,%ld cij = %d\n",b, i, j, cij); + GB_PUTC( Cx[pair_id] = (T_C) cij ) ; + GB_PUTC ( Ci[pair_id] = i ) ; + + } + else + { + //printf(" dot %d is a zombie\n", pair_id); + zc++; + GB_PUTC ( Ci[pair_id] = GB_FLIP (i) ) ; + } + + //__syncthreads(); + + + if( zc > 0) + { + //printf("warp %d zombie count = %d\n", blockIdx.x, zc); + atomicAdd( (unsigned long long int*)&(C->nzombies), (unsigned long long int)zc); + //printf("blk:%d Czombie = %lld\n",blockIdx.x,C->zombies); + } + + } + tile.sync(); + /* + */ + } +} + diff --git a/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_warpix.cu.jit b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_warpix.cu.jit new file mode 100644 index 0000000000..96a938a7c1 --- /dev/null +++ b/GraphBLAS/CUDA/templates/GB_jit_AxB_dot3_phase3_warpix.cu.jit @@ -0,0 +1,356 @@ +const char* const templates_GB_jit_AxB_dot3_phase3_warpix_cu = "templates/GB_jit_AxB_dot3_phase3_warpix.cu\n" +"//------------------------------------------------------------------------------\n" +"// AxB_dot3_phase3_warpix.cu \n" +"//------------------------------------------------------------------------------\n" +"\n" +"// This CUDA kernel produces the semi-ring product of two\n" +"// sparse matrices of types T_A and T_B and common index space size n, to a \n" +"// output matrix of type T_C. The matrices are sparse, with different numbers\n" +"// of non-zeros and different sparsity patterns. \n" +"// ie. we want to produce C = A'*B in the sense of the given semi-ring.\n" +"\n" +"// This version uses a merge-path algorithm, when the sizes nnzA and nnzB are \n" +"// relatively close in size, neither is very spare nor dense, for any size of N.\n" +"// Handles arbitrary sparsity patterns with guaranteed load balance.\n" +"\n" +"// Both the grid and block are 1D, so blockDim.x is the # threads in a\n" +"// threadblock, and the # of threadblocks is grid.x\n" +"\n" +"// Let b = blockIdx.x, and let s be blockDim.x. s= 32 with a variable number\n" +"// of active threads = min( min(g_xnz, g_ynz), 32) \n" +"\n" +"// Thus, threadblock b owns a part of the index set spanned by g_xi and g_yi. 
Its job\n" +"// is to find the intersection of the index sets g_xi and g_yi, perform the semi-ring dot\n" +"// product on those items in the intersection, and finally reduce this data to a scalar, \n" +"// on exit write it to g_odata [b].\n" +"\n" +"// int64_t start <- start of vector pairs for this kernel\n" +"// int64_t end <- end of vector pairs for this kernel\n" +"// int64_t *Bucket <- array of pair indices for all kernels \n" +"// matrix *C <- result matrix \n" +"// matrix *M <- mask matrix\n" +"// matrix *A <- input matrix A\n" +"// matrix *B <- input matrix B\n" +"#include \n" +"#include \n" +"#include \n" +"#include \"mySemiRing.h\"\n" +"#include \"matrix.h\"\n" +"\n" +"// Using tile size fixed at compile time, we don't need shared memory\n" +"#define tile_sz 32 \n" +"\n" +"using namespace cooperative_groups;\n" +"\n" +"template< typename T, int warp_sz>\n" +"__device__ __inline__ \n" +"T GB_reduce_sum(thread_block_tile g, T val)\n" +"{\n" +" // Each iteration halves the number of active threads\n" +" // Each thread adds its partial sum[i] to sum[lane+i]\n" +" for (int i = g.size() / 2; i > 0; i /= 2)\n" +" {\n" +" T next = g.shfl_down( val, i);\n" +" val = GB_ADD( val, next ) ;\n" +" }\n" +" return val;\n" +"}\n" +"\n" +"template< typename T, int warp_sz>\n" +"__device__ __inline__ \n" +"T reduce_plus(thread_block_tile g, T val)\n" +"{\n" +" // Each iteration halves the number of active threads\n" +" // Each thread adds its partial sum[i] to sum[lane+i]\n" +" for (int i = g.size() / 2; i > 0; i /= 2)\n" +" {\n" +" val += g.shfl_down( val, i) ;\n" +" }\n" +" return val; // note: only thread 0 will return full sum and flag value\n" +"}\n" +"\n" +"#define intersects_per_thread 8\n" +"\n" +"template< typename T_C, typename T_A, typename T_B, typename T_X, typename T_Y, typename T_Z> \n" +"__global__ void AxB_dot3_phase3_warp\n" +"(\n" +" int64_t start,\n" +" int64_t end,\n" +" int64_t *Bucket,\n" +" GrB_Matrix C,\n" +" GrB_Matrix M,\n" +" GrB_Matrix A,\n" +" GrB_Matrix B,\n" +" int sz\n" +")\n" +"{\n" +"\n" +" T_A *Ax = (T_A*)A->x;\n" +" T_B *Bx = (T_B*)B->x;\n" +" T_C *Cx = (T_C*)C->x;\n" +" int64_t *Ci = C->i;\n" +" int64_t *Mi = M->i;\n" +" int64_t *Ai = A->i;\n" +" int64_t *Bi = B->i;\n" +" int64_t *Ap = A->p;\n" +" int64_t *Bp = B->p;\n" +"\n" +"\n" +" // zombie count\n" +" int zc = 0;\n" +"\n" +" int64_t pair_id;\n" +"\n" +" // set thread ID\n" +" int tid_global = threadIdx.x+ blockDim.x* blockIdx.x;\n" +" int tid = threadIdx.x;\n" +"\n" +" int b = blockIdx.x ;\n" +"\n" +" // total items to be inspected\n" +" int64_t nnzA = 0;\n" +" int64_t nnzB = 0;\n" +" int64_t n_intersect = 0;\n" +"\n" +" thread_block_tile tile = tiled_partition( this_thread_block());\n" +"\n" +" //int parts = gridDim.x; //Each warp is a part\n" +"\n" +" // Main loop over pairs \n" +" for (pair_id = start +blockIdx.x; // 1 warp per pair \n" +" pair_id < end; \n" +" pair_id += gridDim.x )\n" +" {\n" +"\n" +" int64_t i = Mi[pair_id];\n" +" int64_t j = Ci[pair_id] >> 4;\n" +"\n" +" int64_t pA = Ap[i];\n" +" int64_t pA_end = Ap[i+1];\n" +" nnzA = pA_end - pA;\n" +"\n" +" int64_t pB = Bp[j]; \n" +" int64_t pB_end = Bp[j+1]; \n" +" nnzB = pB_end - pB;\n" +"\n" +" n_intersect = INTMIN( nnzA, nnzB); \n" +"\n" +" /* only for multi-warp version\n" +" if( tid ==0 ) {\n" +" C->zombie_count = end - start;\n" +" Ci[pair_id] = GB_FLIP(i); // zombie until proven otherwise\n" +" printf(\"block %d flipped indices for %lld %lld\\n\", b, i, j);\n" +" }\n" +" tile.sync(); \n" +" */\n" +"\n" +" int64_t nxy = nnzA + nnzB;\n" +" 
/*\n" +" int work_per_warp = (nxy +parts -1)/parts;\n" +" int diag = INTMIN( work_per_warp*blockIdx.x, nxy);\n" +" int diag_end = INTMIN( diag + work_per_warp, nxy);\n" +" //printf(\" thd%d parts = %u wpt = %u diag, diag_end = %u,%u\\n\",tid, parts, work_per_warp, diag, diag_end); \n" +"\n" +" int x_min = INTMAX( (int)(diag - nnzB), 0);\n" +" int x_max = INTMIN( diag, nnzA);\n" +"\n" +" //printf(\"start thd%u x_min = %u x_max = %u\\n\", tid_global, x_min,x_max);\n" +" while ( x_min < x_max) { //binary search for correct diag break\n" +" int pivot = (x_min +x_max)/2;\n" +" if ( Ai[pivot + pA] < Bi[ diag -pivot -1 + pB]) {\n" +" x_min = pivot +1;\n" +" }\n" +" else {\n" +" x_max = pivot;\n" +" }\n" +" }\n" +" int xcoord = x_min;\n" +" int ycoord = diag -x_min -1;\n" +" if (( diag > 0) &&(diag < (nnzA+nnzB)) && (Ai[xcoord+pA] == Bi[ycoord+pB]) ) { \n" +" diag--; //adjust for intersection incrementing both pointers \n" +" }\n" +" // two start points are known now\n" +" int tx_start = xcoord +pA;\n" +" int ty_start = diag -xcoord +pB; \n" +"\n" +" //if (x_start != y_start)\n" +" // printf(\"start thd%u xs,ys = %i,%i\\n\", tid_global, x_start, y_start);\n" +"\n" +" x_min = INTMAX( (int)(diag_end - nnzB), 0);\n" +" x_max = INTMIN( diag_end, nnzA);\n" +"\n" +" while ( x_min < x_max) {\n" +" int pivot = (x_min +x_max)/2;\n" +" //printf(\"thd%u pre_sw piv=%u diag_e = %u xmin,xmax=%u,%u\\n\", tid_global, pivot, diag_end,x_min, x_max);\n" +" if ( Ai[pivot+ pA] < Bi[ diag_end -pivot -1 +pB]) {\n" +" x_min = pivot +1;\n" +" }\n" +" else {\n" +" x_max = pivot;\n" +" }\n" +" //printf(\"thd%u piv=%u xmin,xmax = %u,%u\\n\", tid_global, pivot, x_min, x_max);\n" +" }\n" +" xcoord = x_min;\n" +" ycoord = diag_end -x_min -1;\n" +" if ( (diag_end < (nnzA +nnzB)) && (Ai[xcoord +pA] == Bi[ycoord +pB]) ) { \n" +" diag--; //adjust for intersection incrementing both pointers \n" +" }\n" +" // two end points are known now\n" +" int tx_end = xcoord +pA; \n" +" int ty_end = diag_end - xcoord + pB; \n" +"\n" +" */ \n" +"\n" +" // No search, this warp does all the work\n" +" int tx_start = pA;\n" +" int tx_end = pA_end;\n" +" int ty_start = pB;\n" +" int ty_end = pB_end;\n" +" /*\n" +" if (threadIdx.x ==0 ) {\n" +" printf(\"block %d dot %lld i,j= %lld,%lld\\n\", blockIdx.x, pair_id, i, j);\n" +" printf(\"block %d dot %lld xs,xe= %d,%d ys,ye = %d,%d\\n\", \n" +" blockIdx.x, pair_id, tx_start,tx_end, ty_start, ty_end);\n" +" }\n" +" tile.sync();\n" +" */\n" +"\n" +" // Warp intersection: balanced by design, no idle threads. \n" +" // Each 32 thread warp will handle 32 comparisons per loop.\n" +" // Either A or B takes stride 4, other takes stride 8\n" +" // For this version A strides 4, B strides 8\n" +" T_A aki;\n" +" T_B bkj;\n" +" T_Z cij = MONOID_IDENTITY ;\n" +" int Astride = nnzA > nnzB ? 8 : 4;\n" +" int Ashift = nnzA > nnzB ? 3 : 2;\n" +" int Amask = nnzA > nnzB ? 7 : 3;\n" +" int Bstride = nnzB >= nnzA ? 
8 : 4;\n" +" //printf(\" Astride = %d, Bstride = %d\\n\", Astride, Bstride);\n" +"\n" +" // TODO PLUS_PAIR_INT64, FP32, FP64: no need for cij_exists.\n" +" // just check if cij > 0\n" +"\n" +" int cij_exists = 0 ;\n" +"\n" +" //Warp intersection dot product\n" +" int bitty_row = tid & Amask ;\n" +" int bitty_col = tid >> Ashift ;\n" +"\n" +" int k = tx_start + bitty_row ;\n" +" int l = ty_start + bitty_col ;\n" +" int inc_k, inc_l;\n" +"\n" +" // int last_k = tx_start + 31%Astride ;\n" +" // int last_l = ty_start + 31/Astride ;\n" +"\n" +" //printf(\" thd%u has init value %d, k,l =%d, %d\\n\",tid, cij, k, l );\n" +" while ( k < tx_end && l < ty_end )\n" +" {\n" +" if (Ai [k] == Bi [l])\n" +" {\n" +" GB_GETA ( aki=(T_Z)Ax[k] ) ;\n" +" GB_GETB ( bkj=(T_Z)Bx[l] ) ;\n" +" if (cij_exists)\n" +" {\n" +" T_Z t = GB_MULT( (T_Z) aki, (T_Z) bkj);\n" +" cij = GB_ADD (cij, t ) ;\n" +" //printf(\" thd%d ix at %lld cij += %d * %d \\n\", tid_global, Ai[k], aki, bkj);\n" +" }\n" +" else\n" +" {\n" +" cij_exists = 1 ;\n" +" cij = GB_MULT ( (T_Z) aki, (T_Z) bkj) ;\n" +" //printf(\" thd%d ix at %lld cij = %d * %d \\n\", tid_global, Ai[k], Ax[k], Bx[l]);\n" +" }\n" +" // TODO check terminal condition\n" +" //printf(\" block%u work value = %d, exists = %d\\n\", b, cij, cij_exists);\n" +" }\n" +" if( tid == 31) // Last thread in the warp has the highest index.\n" +" {\n" +" inc_k = ( Ai[k] < Bi[l]);\n" +" inc_l = ( Ai[k] >= Bi[l]);\n" +" k += inc_k ; // * Astride ; \n" +" l += inc_l ; // * Bstride ; \n" +" // last_k += inc_k * Astride ; \n" +" // last_l += inc_l * Bstride ; \n" +" //printf(\"block%d k = %d l= %d, Ai,Bi = %lld,%lld\\n\", b, k, l, Ai[k], Bi[l]);\n" +" }\n" +" tile.sync();\n" +" inc_k = tile.shfl_down( inc_k, 31-tid);\n" +" if( inc_k) {\n" +" k = tile.shfl_down(k,31-tid) + bitty_row ; // tid%Astride;\n" +" }\n" +" else {\n" +" l = tile.shfl_down(l,31-tid) + bitty_col ; // tid/Astride;\n" +" }\n" +" //printf(\"block%d tid = %d k = %d l= %d\\n\", b, tid, k, l);\n" +" }\n" +"\n" +" //--------------------------------------------------------------------------\n" +" // reduce sum per-thread values to a single scalar, get OR of flag\n" +" //--------------------------------------------------------------------------\n" +"\n" +" // Do vote here for control.\n" +" cij_exists = tile.any( cij_exists);\n" +" tile.sync();\n" +"\n" +" if (cij_exists)\n" +" {\n" +" cij = GB_reduce_sum( tile, cij );\n" +" }\n" +" tile.sync();\n" +" \n" +"\n" +" // Atomic write result for this block to global mem\n" +" if (tid == 0)\n" +" {\n" +" //printf (\"final %d : %d exists = %d\\n\", b, cij, cij_exists) ;\n" +" if (cij_exists)\n" +" {\n" +" //printf(\" cij = %d\\n\", cij);\n" +" //T_C old = atomicCAS( (T_C *)&(Cx[pair_id]), MONOID_IDENTITY, (T_C) cij ) ;\n" +" //T_C assumed;\n" +" GB_PUTC( Cx[pair_id] = (T_C) cij ) ;\n" +" // Need ATOMIC_ADD here, use CAS on semi-ring operation \n" +" //if ( old != MONOID_IDENTITY) {\n" +" // do {\n" +" // assumed = old;\n" +" // old = atomicCAS( (T_C *)&(Cx[pair_id]), \n" +" // assumed, \n" +" // (T_C)( ADD( assumed, cij ) ) );\n" +" // }\n" +" // while ( assumed != old);\n" +" \n" +" //} \n" +" //unsigned long long int old_i;\n" +" //old_i = atomicCAS((unsigned long long int *)&(Ci[pair_id]),\n" +" // (unsigned long long int) GB_FLIP(i), i ) ; \n" +" //if ( old_i == GB_FLIP(i) ) {\n" +" // zc -= 1; //decrement zombies, this one is alive\n" +" // atomicAdd( (unsigned long long int*)&(C->zombie_count), \n" +" // (unsigned long long int)zc);\n" +" // }\n" +" GB_PUTC ( Ci[pair_id] = i ) ;\n" +" 
\n" +" }\n" +" else\n" +" {\n" +" //printf(\" dot %d is a zombie\\n\", pair_id);\n" +" zc++;\n" +" GB_PUTC ( Ci[pair_id] = GB_FLIP (i) ) ;\n" +" }\n" +" }\n" +" //__syncthreads(); \n" +" }\n" +"\n" +" if( tid ==0 && zc > 0)\n" +" {\n" +" //printf(\"warp %d zombie count = %d\\n\", blockIdx.x, zc);\n" +" atomicAdd( (unsigned long long int*)&(C->zombie_count), (unsigned long long int)zc);\n" +" //printf(\"blk:%d Czombie = %lld\\n\",blockIdx.x,C->zombie_count);\n" +" }\n" +"\n" +"}\n" +"\n" +; diff --git a/GraphBLAS/CUDA/templates/cooperative_groups.h b/GraphBLAS/CUDA/templates/cooperative_groups.h new file mode 100755 index 0000000000..1f296729e5 --- /dev/null +++ b/GraphBLAS/CUDA/templates/cooperative_groups.h @@ -0,0 +1,996 @@ +/* + * Copyright 1993-2016 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +#ifndef _COOPERATIVE_GROUPS_H_ +# define _COOPERATIVE_GROUPS_H_ + +#if defined(__cplusplus) && defined(__CUDACC__) + +# include "cooperative_groups_helpers.h" + +_CG_BEGIN_NAMESPACE + +/** + * class thread_group; + * + * Generic thread group type, into which all groups are convertible. + * It acts as a container for all storage necessary for the derived groups, + * and will dispatch the API calls to the correct derived group. This means + * that all derived groups must implement the same interface as thread_group. + */ +class thread_group +{ + friend _CG_QUALIFIER thread_group this_thread(); + friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz); + friend class thread_block; + + protected: + union __align__(8) { + unsigned int type : 8; + struct { + unsigned int type : 8; + unsigned int size : 24; + unsigned int mask; + } coalesced; + struct { + void* ptr[2]; + } buffer; + } _data; + + _CG_QUALIFIER thread_group operator=(const thread_group& src); + _CG_QUALIFIER thread_group(__internal::groupType type) { + _data.type = type; + } + +#if __cplusplus >= 201103L + static_assert(sizeof(_data) == 16, "Failed size check"); +#endif + +public: + _CG_QUALIFIER unsigned int size() const; + _CG_QUALIFIER unsigned int thread_rank() const; + _CG_QUALIFIER void sync() const; +}; + +/** + * thread_group this_thread() + * + * Constructs a generic thread_group containing only the calling thread + */ +_CG_QUALIFIER thread_group this_thread() +{ + thread_group g = thread_group(__internal::Coalesced); + g._data.coalesced.mask = __internal::lanemask32_eq(); + g._data.coalesced.size = 1; + return (g); +} + +#if defined(_CG_HAS_MULTI_GRID_GROUP) + +/** + * class multi_grid_group; + * + * Threads within this this group are guaranteed to be co-resident on the + * same system, on multiple devices within the same launched kernels. + * To use this group, the kernel must have been launched with + * cuLaunchCooperativeKernelMultiDevice (or the CUDA Runtime equivalent), + * and the device must support it (queryable device attribute). 
+ * + * Constructed via this_multi_grid(); + */ +class multi_grid_group +{ + friend _CG_QUALIFIER multi_grid_group this_multi_grid(); + + struct __align__(8) { + unsigned long long handle; + unsigned int size; + unsigned int rank; + } _data; + +#if __cplusplus >= 201103L + static_assert(sizeof(_data) == 16, "Failed size check"); +#endif + +public: + _CG_QUALIFIER multi_grid_group() { + _data.handle = __internal::multi_grid::get_intrinsic_handle(); + _data.size = __internal::multi_grid::size(_data.handle); + _data.rank = __internal::multi_grid::thread_rank(_data.handle); + } + + _CG_QUALIFIER bool is_valid() const { + return (_data.handle != 0); + } + + _CG_QUALIFIER void sync() const { + _CG_ASSERT(is_valid()); + __internal::multi_grid::sync(_data.handle); + } + + _CG_QUALIFIER unsigned int size() const { + _CG_ASSERT(is_valid()); + return (_data.size); + } + + _CG_QUALIFIER unsigned int thread_rank() const { + _CG_ASSERT(is_valid()); + return (_data.rank); + } + + _CG_QUALIFIER unsigned int grid_rank() const { + _CG_ASSERT(is_valid()); + return (__internal::multi_grid::grid_rank(_data.handle)); + } + + _CG_QUALIFIER unsigned int num_grids() const { + _CG_ASSERT(is_valid()); + return (__internal::multi_grid::num_grids(_data.handle)); + } +}; + +/** + * multi_grid_group this_multi_grid() + * + * Constructs a multi_grid_group + */ +_CG_QUALIFIER multi_grid_group this_multi_grid() +{ + return (multi_grid_group()); +} + +#endif + +#if defined(_CG_HAS_GRID_GROUP) + +/** + * class grid_group; + * + * Threads within this this group are guaranteed to be co-resident on the + * same device within the same launched kernel. To use this group, the kernel + * must have been launched with cuLaunchCooperativeKernel (or the CUDA Runtime equivalent), + * and the device must support it (queryable device attribute). + * + * Constructed via this_grid(); + */ +class grid_group +{ + friend _CG_QUALIFIER grid_group this_grid(); + + struct __align__(8) { + unsigned long long handle; + unsigned int size; + unsigned int rank; + } _data; + +#if __cplusplus >= 201103L + static_assert(sizeof(_data) == 16, "Failed size check"); +#endif + + public: + _CG_QUALIFIER grid_group() { + _data.handle = (__internal::grid::get_intrinsic_handle()); + _data.size = __internal::grid::size(_data.handle); + _data.rank = __internal::grid::thread_rank(_data.handle); + } + + _CG_QUALIFIER bool is_valid() const { + return (_data.handle != 0); + } + + _CG_QUALIFIER void sync() const { + _CG_ASSERT(is_valid()); + __internal::grid::sync(_data.handle); + } + + _CG_QUALIFIER unsigned int size() const { + _CG_ASSERT(is_valid()); + return (_data.size); + } + + _CG_QUALIFIER unsigned int thread_rank() const { + _CG_ASSERT(is_valid()); + return (_data.rank); + } + + _CG_QUALIFIER dim3 group_dim() const { + _CG_ASSERT(is_valid()); + return (__internal::grid::grid_dim()); + } + +}; + +/** + * grid_group this_grid() + * + * Constructs a grid_group + */ +_CG_QUALIFIER grid_group this_grid() +{ + return (grid_group()); +} + +#endif + +/** + * class thread_block + * + * Every GPU kernel is executed by a grid of thread blocks, and threads within + * each block are guaranteed to reside on the same streaming multiprocessor. + * A thread_block represents a thread block whose dimensions are not known until runtime. 
+ * + * Constructed via this_thread_block(); + */ +class thread_block : public thread_group +{ + friend _CG_QUALIFIER thread_block this_thread_block(); + friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz); + friend _CG_QUALIFIER thread_group tiled_partition(const thread_block& parent, unsigned int tilesz); + + _CG_QUALIFIER thread_block() : thread_group(__internal::ThreadBlock) { + } + + // Internal Use + _CG_QUALIFIER thread_group _get_tiled_threads(unsigned int tilesz) const { + const bool pow2_tilesz = ((tilesz & (tilesz - 1)) == 0); + + // Invalid, immediately fail + if (tilesz == 0 || (tilesz > 32) || !pow2_tilesz) { + __internal::abort(); + return (thread_block()); + } + + unsigned int mask; + unsigned int base_offset = thread_rank() & (~(tilesz - 1)); + unsigned int masklength = min(size() - base_offset, tilesz); + + mask = (unsigned int)(-1) >> (32 - masklength); + mask <<= (__internal::laneid() & ~(tilesz - 1)); + thread_group tile = thread_group(__internal::CoalescedTile); + tile._data.coalesced.mask = mask; + tile._data.coalesced.size = __popc(mask); + return (tile); + } + + public: + _CG_QUALIFIER void sync() const { + __internal::cta::sync(); + } + + _CG_QUALIFIER unsigned int size() const { + return (__internal::cta::size()); + } + + _CG_QUALIFIER unsigned int thread_rank() const { + return (__internal::cta::thread_rank()); + } + + // Additional functionality exposed by the group + _CG_QUALIFIER dim3 group_index() const { + return (__internal::cta::group_index()); + } + + _CG_QUALIFIER dim3 thread_index() const { + return (__internal::cta::thread_index()); + } + + _CG_QUALIFIER dim3 group_dim() const { + return (__internal::cta::block_dim()); + } + +}; + +/** + * thread_block this_thread_block() + * + * Constructs a thread_block group + */ +_CG_QUALIFIER thread_block this_thread_block() +{ + return (thread_block()); +} + +/** + * class coalesced_group + * + * A group representing the current set of converged threads in a warp. + * The size of the group is not guaranteed and it may return a group of + * only one thread (itself). + * + * This group exposes warp-synchronous builtins. 
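+ *
+ * For example (an illustrative sketch added here, not part of the original
+ * NVIDIA header; 'val' and the branch condition are hypothetical):
+ *
+ *   if (val > 0) {
+ *       coalesced_group active = coalesced_threads();  // threads that took this branch
+ *       int leader_val = active.shfl(val, 0);          // broadcast from group rank 0
+ *   }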
+ * Constructed via coalesced_threads(); + */ +class coalesced_group : public thread_group +{ + friend _CG_QUALIFIER coalesced_group coalesced_threads(); + friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz); + friend _CG_QUALIFIER coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tilesz); + + _CG_QUALIFIER unsigned int _packLanes(unsigned laneMask) const { + unsigned int member_pack = 0; + unsigned int member_rank = 0; + for (int bit_idx = 0; bit_idx < 32; bit_idx++) { + unsigned int lane_bit = _data.coalesced.mask & (1 << bit_idx); + if (lane_bit) { + if (laneMask & lane_bit) + member_pack |= 1 << member_rank; + member_rank++; + } + } + return (member_pack); + } + + // Internal Use + _CG_QUALIFIER coalesced_group _get_tiled_threads(unsigned int tilesz) const { + const bool pow2_tilesz = ((tilesz & (tilesz - 1)) == 0); + + // Invalid, immediately fail + if (tilesz == 0 || (tilesz > 32) || !pow2_tilesz) { + __internal::abort(); + return (coalesced_group(0)); + } + if (size() <= tilesz) { + return (*this); + } + + if ((_data.type == __internal::CoalescedTile) && pow2_tilesz) { + unsigned int base_offset = (thread_rank() & (~(tilesz - 1))); + unsigned int masklength = min(size() - base_offset, tilesz); + unsigned int mask = (unsigned int)(-1) >> (32 - masklength); + + mask <<= (__internal::laneid() & ~(tilesz - 1)); + coalesced_group coalesced_tile = coalesced_group(mask); + coalesced_tile._data.type = __internal::CoalescedTile; + return (coalesced_tile); + } + else if ((_data.type == __internal::Coalesced) && pow2_tilesz) { + unsigned int mask = 0; + unsigned int member_rank = 0; + int seen_lanes = (thread_rank() / tilesz) * tilesz; + for (unsigned int bit_idx = 0; bit_idx < 32; bit_idx++) { + unsigned int lane_bit = _data.coalesced.mask & (1 << bit_idx); + if (lane_bit) { + if (seen_lanes <= 0 && member_rank < tilesz) { + mask |= lane_bit; + member_rank++; + } + seen_lanes--; + } + } + return (coalesced_group(mask)); + } + else { + // None in _CG_VERSION 1000 + __internal::abort(); + } + + return (coalesced_group(0)); + } + + protected: + // Construct a group from scratch (coalesced_threads) + _CG_QUALIFIER coalesced_group(unsigned int mask) : thread_group(__internal::Coalesced) { + _data.coalesced.mask = mask; + _data.coalesced.size = __popc(mask); + } + + public: + _CG_QUALIFIER unsigned int size() const { + return (_data.coalesced.size); + } + _CG_QUALIFIER unsigned int thread_rank() const { + return (__popc(_data.coalesced.mask & __internal::lanemask32_lt())); + } + _CG_QUALIFIER void sync() const { + __syncwarp(_data.coalesced.mask); + } + +#define COALESCED_SHFL_FUNCTION(type) \ + _CG_QUALIFIER type shfl(type var, unsigned int src_rank) const { \ + unsigned int lane = (src_rank == 0) ? __ffs(_data.coalesced.mask) - 1 : \ + (size() == 32) ? 
src_rank : __fns(_data.coalesced.mask, 0, (src_rank + 1)); \ + return (__shfl_sync(_data.coalesced.mask, var, lane, 32)); \ + } + +#define COALESCED_SHFL_UP_FUNCTION(type) \ + _CG_QUALIFIER type shfl_up(type var, int delta) const { \ + if (size() == 32) { \ + return (__shfl_up_sync(0xFFFFFFFF, var, delta, 32)); \ + } \ + unsigned lane = __fns(_data.coalesced.mask, __internal::laneid(), -(delta + 1)); \ + if (lane >= 32) lane = __internal::laneid(); \ + return (__shfl_sync(_data.coalesced.mask, var, lane, 32)); \ + } + +#define COALESCED_SHFL_DOWN_FUNCTION(type) \ + _CG_QUALIFIER type shfl_down(type var, int delta) const { \ + if (size() == 32) { \ + return (__shfl_down_sync(0xFFFFFFFF, var, delta, 32)); \ + } \ + unsigned int lane = __fns(_data.coalesced.mask, __internal::laneid(), delta + 1); \ + if (lane >= 32) lane = __internal::laneid(); \ + return (__shfl_sync(_data.coalesced.mask, var, lane, 32)); \ + } + + COALESCED_SHFL_FUNCTION(int); + COALESCED_SHFL_FUNCTION(unsigned int); + COALESCED_SHFL_FUNCTION(long); + COALESCED_SHFL_FUNCTION(unsigned long); + COALESCED_SHFL_FUNCTION(long long); + COALESCED_SHFL_FUNCTION(unsigned long long); + COALESCED_SHFL_FUNCTION(float); + COALESCED_SHFL_FUNCTION(double); + + COALESCED_SHFL_UP_FUNCTION(int); + COALESCED_SHFL_UP_FUNCTION(unsigned int); + COALESCED_SHFL_UP_FUNCTION(long); + COALESCED_SHFL_UP_FUNCTION(unsigned long); + COALESCED_SHFL_UP_FUNCTION(long long); + COALESCED_SHFL_UP_FUNCTION(unsigned long long); + COALESCED_SHFL_UP_FUNCTION(float); + COALESCED_SHFL_UP_FUNCTION(double); + + COALESCED_SHFL_DOWN_FUNCTION(int); + COALESCED_SHFL_DOWN_FUNCTION(unsigned int); + COALESCED_SHFL_DOWN_FUNCTION(long); + COALESCED_SHFL_DOWN_FUNCTION(unsigned long); + COALESCED_SHFL_DOWN_FUNCTION(long long); + COALESCED_SHFL_DOWN_FUNCTION(unsigned long long); + COALESCED_SHFL_DOWN_FUNCTION(float); + COALESCED_SHFL_DOWN_FUNCTION(double); + +# ifdef _CG_HAS_FP16_COLLECTIVE + COALESCED_SHFL_FUNCTION(__half); + COALESCED_SHFL_UP_FUNCTION(__half); + COALESCED_SHFL_DOWN_FUNCTION(__half); + + COALESCED_SHFL_FUNCTION(__half2); + COALESCED_SHFL_UP_FUNCTION(__half2); + COALESCED_SHFL_DOWN_FUNCTION(__half2); +# endif + +#undef COALESCED_SHFL_FUNCTION +#undef COALESCED_SHFL_UP_FUNCTION +#undef COALESCED_SHFL_DOWN_FUNCTION + + _CG_QUALIFIER int any(int predicate) const { + return (__ballot_sync(_data.coalesced.mask, predicate) != 0); + } + _CG_QUALIFIER int all(int predicate) const { + return (__ballot_sync(_data.coalesced.mask, predicate) == _data.coalesced.mask); + } + _CG_QUALIFIER unsigned int ballot(int predicate) const { + if (size() == 32) { + return (__ballot_sync(0xFFFFFFFF, predicate)); + } + unsigned int lane_ballot = __ballot_sync(_data.coalesced.mask, predicate); + return (_packLanes(lane_ballot)); + } + +#ifdef _CG_HAS_MATCH_COLLECTIVE + +# define COALESCED_MATCH_ANY_FUNCTION(type) \ + _CG_QUALIFIER unsigned int match_any(type val) const { \ + if (size() == 32) { \ + return (__match_any_sync(0xFFFFFFFF, val)); \ + } \ + unsigned int lane_match = __match_any_sync(_data.coalesced.mask, val); \ + return (_packLanes(lane_match)); \ + } +# define COALESCED_MATCH_ALL_FUNCTION(type) \ + _CG_QUALIFIER unsigned int match_all(type val, int &pred) const { \ + if (size() == 32) { \ + return (__match_all_sync(0xFFFFFFFF, val, &pred)); \ + } \ + unsigned int lane_match = __match_all_sync(_data.coalesced.mask, val, &pred); \ + return (_packLanes(lane_match)); \ + } + + COALESCED_MATCH_ANY_FUNCTION(int); + COALESCED_MATCH_ANY_FUNCTION(unsigned int); + 
COALESCED_MATCH_ANY_FUNCTION(long); + COALESCED_MATCH_ANY_FUNCTION(unsigned long); + COALESCED_MATCH_ANY_FUNCTION(long long); + COALESCED_MATCH_ANY_FUNCTION(unsigned long long); + COALESCED_MATCH_ANY_FUNCTION(float); + COALESCED_MATCH_ANY_FUNCTION(double); + + COALESCED_MATCH_ALL_FUNCTION(int); + COALESCED_MATCH_ALL_FUNCTION(unsigned int); + COALESCED_MATCH_ALL_FUNCTION(long); + COALESCED_MATCH_ALL_FUNCTION(unsigned long); + COALESCED_MATCH_ALL_FUNCTION(long long); + COALESCED_MATCH_ALL_FUNCTION(unsigned long long); + COALESCED_MATCH_ALL_FUNCTION(float); + COALESCED_MATCH_ALL_FUNCTION(double); + +# undef COALESCED_MATCH_ANY_FUNCTION +# undef COALESCED_MATCH_ALL_FUNCTION + +#endif /* !_CG_HAS_MATCH_COLLECTIVE */ + +}; + +_CG_QUALIFIER coalesced_group coalesced_threads() +{ + return (coalesced_group(__activemask())); +} + +template +class __thread_block_tile_base : public thread_group +{ + static const unsigned int numThreads = Size; + + _CG_QUALIFIER unsigned int build_mask() const { + unsigned int mask; + + if (numThreads == 32) { + mask = 0xFFFFFFFF; + } + else { + mask = (unsigned int)(-1) >> (32 - numThreads); + mask <<= (__internal::laneid() & (~(numThreads - 1))); + } + return (mask); + } + + protected: + _CG_QUALIFIER __thread_block_tile_base() : thread_group(__internal::CoalescedTile) { + _data.coalesced.mask = build_mask(); + _data.coalesced.size = numThreads; + } + + public: + _CG_QUALIFIER void sync() const { + __syncwarp(build_mask()); + } + _CG_QUALIFIER unsigned int thread_rank() const { + return (__internal::laneid() & (numThreads - 1)); + } + _CG_QUALIFIER unsigned int size() const { + return (numThreads); + } + + // PTX supported collectives + _CG_QUALIFIER int shfl(int var, int srcRank) const { + return (__shfl_sync(build_mask(), var, srcRank, numThreads)); + } + _CG_QUALIFIER int shfl_down(int var, unsigned int delta) const { + return (__shfl_down_sync(build_mask(), var, delta, numThreads)); + } + _CG_QUALIFIER int shfl_up(int var, unsigned int delta) const { + return (__shfl_up_sync(build_mask(), var, delta, numThreads)); + } + _CG_QUALIFIER int shfl_xor(int var, unsigned int laneMask) const { + return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads)); + } + _CG_QUALIFIER unsigned int shfl(unsigned int var, int srcRank) const { + return (__shfl_sync(build_mask(), var, srcRank, numThreads)); + } + _CG_QUALIFIER unsigned int shfl_down(unsigned int var, unsigned int delta) const { + return (__shfl_down_sync(build_mask(), var, delta, numThreads)); + } + _CG_QUALIFIER unsigned int shfl_up(unsigned int var, unsigned int delta) const { + return (__shfl_up_sync(build_mask(), var, delta, numThreads)); + } + _CG_QUALIFIER unsigned int shfl_xor(unsigned int var, unsigned int laneMask) const { + return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads)); + } + _CG_QUALIFIER long shfl(long var, int srcRank) const { + return (__shfl_sync(build_mask(), var, srcRank, numThreads)); + } + _CG_QUALIFIER long shfl_down(long var, unsigned int delta) const { + return (__shfl_down_sync(build_mask(), var, delta, numThreads)); + } + _CG_QUALIFIER long shfl_up(long var, unsigned int delta) const { + return (__shfl_up_sync(build_mask(), var, delta, numThreads)); + } + _CG_QUALIFIER long shfl_xor(long var, unsigned int laneMask) const { + return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads)); + } + _CG_QUALIFIER unsigned long shfl(unsigned long var, int srcRank) const { + return (__shfl_sync(build_mask(), var, srcRank, numThreads)); + } + _CG_QUALIFIER unsigned long 
shfl_down(unsigned long var, unsigned int delta) const { + return (__shfl_down_sync(build_mask(), var, delta, numThreads)); + } + _CG_QUALIFIER unsigned long shfl_up(unsigned long var, unsigned int delta) const { + return (__shfl_up_sync(build_mask(), var, delta, numThreads)); + } + _CG_QUALIFIER unsigned long shfl_xor(unsigned long var, unsigned int laneMask) const { + return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads)); + } + _CG_QUALIFIER long long shfl(long long var, int srcRank) const { + return (__shfl_sync(build_mask(), var, srcRank, numThreads)); + } + _CG_QUALIFIER long long shfl_down(long long var, unsigned int delta) const { + return (__shfl_down_sync(build_mask(), var, delta, numThreads)); + } + _CG_QUALIFIER long long shfl_up(long long var, unsigned int delta) const { + return (__shfl_up_sync(build_mask(), var, delta, numThreads)); + } + _CG_QUALIFIER long long shfl_xor(long long var, unsigned int laneMask) const { + return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads)); + } + _CG_QUALIFIER unsigned long long shfl(unsigned long long var, int srcRank) const { + return (__shfl_sync(build_mask(), var, srcRank, numThreads)); + } + _CG_QUALIFIER unsigned long long shfl_down(unsigned long long var, unsigned int delta) const { + return (__shfl_down_sync(build_mask(), var, delta, numThreads)); + } + _CG_QUALIFIER unsigned long long shfl_up(unsigned long long var, unsigned int delta) const { + return (__shfl_up_sync(build_mask(), var, delta, numThreads)); + } + _CG_QUALIFIER unsigned long long shfl_xor(unsigned long long var, unsigned int laneMask) const { + return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads)); + } + _CG_QUALIFIER float shfl(float var, int srcRank) const { + return (__shfl_sync(build_mask(), var, srcRank, numThreads)); + } + _CG_QUALIFIER float shfl_down(float var, unsigned int delta) const { + return (__shfl_down_sync(build_mask(), var, delta, numThreads)); + } + _CG_QUALIFIER float shfl_up(float var, unsigned int delta) const { + return (__shfl_up_sync(build_mask(), var, delta, numThreads)); + } + _CG_QUALIFIER float shfl_xor(float var, unsigned int laneMask) const { + return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads)); + } + _CG_QUALIFIER double shfl(double var, int srcRank) const { + return (__shfl_sync(build_mask(), var, srcRank, numThreads)); + } + _CG_QUALIFIER double shfl_down(double var, unsigned int delta) const { + return (__shfl_down_sync(build_mask(), var, delta, numThreads)); + } + _CG_QUALIFIER double shfl_up(double var, unsigned int delta) const { + return (__shfl_up_sync(build_mask(), var, delta, numThreads)); + } + _CG_QUALIFIER double shfl_xor(double var, unsigned int laneMask) const { + return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads)); + } + _CG_QUALIFIER int any(int predicate) const { + unsigned int lane_ballot = build_mask() & __ballot_sync(build_mask(), predicate); + return (lane_ballot != 0); + } + _CG_QUALIFIER int all(int predicate) const { + unsigned int lane_ballot = build_mask() & __ballot_sync(build_mask(), predicate); + return (lane_ballot == build_mask()); + } + _CG_QUALIFIER unsigned int ballot(int predicate) const { + unsigned int lane_ballot = build_mask() & __ballot_sync(build_mask(), predicate); + return (lane_ballot >> (__internal::laneid() & (~(numThreads - 1)))); + } + +#ifdef _CG_HAS_FP16_COLLECTIVE + _CG_QUALIFIER __half shfl(__half var, int srcRank) const { + return (__shfl_sync(build_mask(), var, srcRank, numThreads)); + } + _CG_QUALIFIER __half 
shfl_down(__half var, unsigned int delta) const { + return (__shfl_down_sync(build_mask(), var, delta, numThreads)); + } + _CG_QUALIFIER __half shfl_up(__half var, unsigned int delta) const { + return (__shfl_up_sync(build_mask(), var, delta, numThreads)); + } + _CG_QUALIFIER __half shfl_xor(__half var, unsigned int laneMask) const { + return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads)); + } + _CG_QUALIFIER __half2 shfl(__half2 var, int srcRank) const { + return (__shfl_sync(build_mask(), var, srcRank, numThreads)); + } + _CG_QUALIFIER __half2 shfl_down(__half2 var, unsigned int delta) const { + return (__shfl_down_sync(build_mask(), var, delta, numThreads)); + } + _CG_QUALIFIER __half2 shfl_up(__half2 var, unsigned int delta) const { + return (__shfl_up_sync(build_mask(), var, delta, numThreads)); + } + _CG_QUALIFIER __half2 shfl_xor(__half2 var, unsigned int laneMask) const { + return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads)); + } +#endif + +#ifdef _CG_HAS_MATCH_COLLECTIVE + _CG_QUALIFIER unsigned int match_any(int val) const { + unsigned int lane_match = build_mask() & __match_any_sync(build_mask(), val); + return (lane_match >> (__internal::laneid() & (~(numThreads - 1)))); + } + _CG_QUALIFIER unsigned int match_any(unsigned int val) const { + unsigned int lane_match = build_mask() & __match_any_sync(build_mask(), val); + return (lane_match >> (__internal::laneid() & (~(numThreads - 1)))); + } + _CG_QUALIFIER unsigned int match_any(long val) const { + unsigned int lane_match = build_mask() & __match_any_sync(build_mask(), val); + return (lane_match >> (__internal::laneid() & (~(numThreads - 1)))); + } + _CG_QUALIFIER unsigned int match_any(unsigned long val) const { + unsigned int lane_match = build_mask() & __match_any_sync(build_mask(), val); + return (lane_match >> (__internal::laneid() & (~(numThreads - 1)))); + } + _CG_QUALIFIER unsigned int match_any(long long val) const { + unsigned int lane_match = build_mask() & __match_any_sync(build_mask(), val); + return (lane_match >> (__internal::laneid() & (~(numThreads - 1)))); + } + _CG_QUALIFIER unsigned int match_any(unsigned long long val) const { + unsigned int lane_match = build_mask() & __match_any_sync(build_mask(), val); + return (lane_match >> (__internal::laneid() & (~(numThreads - 1)))); + } + _CG_QUALIFIER unsigned int match_any(float val) const { + unsigned int lane_match = build_mask() & __match_any_sync(build_mask(), val); + return (lane_match >> (__internal::laneid() & (~(numThreads - 1)))); + } + _CG_QUALIFIER unsigned int match_any(double val) const { + unsigned int lane_match = build_mask() & __match_any_sync(build_mask(), val); + return (lane_match >> (__internal::laneid() & (~(numThreads - 1)))); + } + + _CG_QUALIFIER unsigned int match_all(int val, int &pred) const { + unsigned int lane_match = build_mask() & __match_all_sync(build_mask(), val, &pred); + return (lane_match >> (__internal::laneid() & (~(numThreads - 1)))); + } + _CG_QUALIFIER unsigned int match_all(unsigned int val, int &pred) const { + unsigned int lane_match = build_mask() & __match_all_sync(build_mask(), val, &pred); + return (lane_match >> (__internal::laneid() & (~(numThreads - 1)))); + } + _CG_QUALIFIER unsigned int match_all(long val, int &pred) const { + unsigned int lane_match = build_mask() & __match_all_sync(build_mask(), val, &pred); + return (lane_match >> (__internal::laneid() & (~(numThreads - 1)))); + } + _CG_QUALIFIER unsigned int match_all(unsigned long val, int &pred) const { + unsigned int 
lane_match = build_mask() & __match_all_sync(build_mask(), val, &pred); + return (lane_match >> (__internal::laneid() & (~(numThreads - 1)))); + } + _CG_QUALIFIER unsigned int match_all(long long val, int &pred) const { + unsigned int lane_match = build_mask() & __match_all_sync(build_mask(), val, &pred); + return (lane_match >> (__internal::laneid() & (~(numThreads - 1)))); + } + _CG_QUALIFIER unsigned int match_all(unsigned long long val, int &pred) const { + unsigned int lane_match = build_mask() & __match_all_sync(build_mask(), val, &pred); + return (lane_match >> (__internal::laneid() & (~(numThreads - 1)))); + } + _CG_QUALIFIER unsigned int match_all(float val, int &pred) const { + unsigned int lane_match = build_mask() & __match_all_sync(build_mask(), val, &pred); + return (lane_match >> (__internal::laneid() & (~(numThreads - 1)))); + } + _CG_QUALIFIER unsigned int match_all(double val, int &pred) const { + unsigned int lane_match = build_mask() & __match_all_sync(build_mask(), val, &pred); + return (lane_match >> (__internal::laneid() & (~(numThreads - 1)))); + } +#endif + +}; + +/** + * class thread_block_tile + * + * Statically-sized group type, representing one tile of a thread block. + * The only specializations currently supported are those with native + * hardware support (1/2/4/8/16/32) + * + * This group exposes warp-synchronous builtins. + * Constructed via tiled_partition(class thread_block); + */ +template +class thread_block_tile; +template <> class thread_block_tile<32> : public __thread_block_tile_base<32> { }; +template <> class thread_block_tile<16> : public __thread_block_tile_base<16> { }; +template <> class thread_block_tile<8> : public __thread_block_tile_base<8> { }; +template <> class thread_block_tile<4> : public __thread_block_tile_base<4> { }; +template <> class thread_block_tile<2> : public __thread_block_tile_base<2> { }; +template <> class thread_block_tile<1> : public __thread_block_tile_base<1> { }; + +/** + * Outer level API calls + * void sync(GroupT) - see .sync() + * void thread_rank(GroupT) - see .thread_rank() + * void group_size(GroupT) - see .size() + */ +template _CG_QUALIFIER void sync(GroupT const &g) +{ + g.sync(); +} + +template _CG_QUALIFIER unsigned int thread_rank(GroupT const& g) +{ + return (g.thread_rank()); +} + +template _CG_QUALIFIER unsigned int group_size(GroupT const &g) +{ + return (g.size()); +} + +/** + * .sync() + * + * Executes a barrier across the group + * + * Implements both a compiler fence and an architectural fence to prevent, + * memory reordering around the barrier. + */ +_CG_QUALIFIER void thread_group::sync() const +{ + if (_data.type == __internal::Coalesced || _data.type == __internal::CoalescedTile) { + static_cast(this)->sync(); + } + else { + static_cast(this)->sync(); + } +} + +/** + * .size() + * + * Returns the total number of threads in the group. + */ +_CG_QUALIFIER unsigned int thread_group::size() const +{ + if (_data.type == __internal::Coalesced || _data.type == __internal::CoalescedTile) { + return (static_cast(this)->size()); + } + else { + return (static_cast(this)->size()); + } +} + +/** + * .thread_rank() + * + * Returns the linearized rank of the calling thread along the interval [0, size()). 
+ */ +_CG_QUALIFIER unsigned int thread_group::thread_rank() const +{ + if (_data.type == __internal::Coalesced || _data.type == __internal::CoalescedTile) { + return (static_cast(this)->thread_rank()); + } + else { + return (static_cast(this)->thread_rank()); + } +} + +/** + * tiled_partition + * + * The tiled_partition(parent, tilesz) method is a collective operation that + * partitions the parent group into a one-dimensional, row-major, tiling of subgroups. + * + * A total of ((size(parent)+tilesz-1)/tilesz) subgroups will + * be created where threads having identical k = (thread_rank(parent)/tilesz) + * will be members of the same subgroup. + * + * The implementation may cause the calling thread to wait until all the members + * of the parent group have invoked the operation before resuming execution. + * + * Functionality is limited to power-of-two sized subgorup instances of at most + * 32 threads. Only thread_block, thread_block_tile<>, and their subgroups can be + * tiled_partition() in _CG_VERSION 1000. + */ +_CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz) +{ + if (parent._data.type == __internal::Coalesced || parent._data.type == __internal::CoalescedTile) { + return (static_cast(parent)._get_tiled_threads(tilesz)); + } + else { + return (static_cast(parent)._get_tiled_threads(tilesz)); + } +} +// Thread block type overload: returns a basic thread_group for now (may be specialized later) +_CG_QUALIFIER thread_group tiled_partition(const thread_block& parent, unsigned int tilesz) +{ + return (parent._get_tiled_threads(tilesz)); +} +// Coalesced group type overload: retains its ability to stay coalesced +_CG_QUALIFIER coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tilesz) +{ + return (parent._get_tiled_threads(tilesz)); +} + +namespace __internal { + + // For specializing on different tiled_partition template arguments + template + struct tiled_partition_impl; + + template + struct tiled_partition_impl : public thread_block_tile { + _CG_QUALIFIER tiled_partition_impl(thread_block const &) : thread_block_tile() {} + }; + template + struct tiled_partition_impl > : public thread_block_tile { + _CG_QUALIFIER tiled_partition_impl(thread_block_tile<32> const&) : thread_block_tile() {} + }; + template + struct tiled_partition_impl > : public thread_block_tile { + _CG_QUALIFIER tiled_partition_impl(thread_block_tile<16> const&) : thread_block_tile() {} + }; + template + struct tiled_partition_impl > : public thread_block_tile { + _CG_QUALIFIER tiled_partition_impl(thread_block_tile<8> const&) : thread_block_tile() {} + }; + template + struct tiled_partition_impl > : public thread_block_tile { + _CG_QUALIFIER tiled_partition_impl(thread_block_tile<4> const&) : thread_block_tile() {} + }; + template + struct tiled_partition_impl > : public thread_block_tile { + _CG_QUALIFIER tiled_partition_impl(thread_block_tile<2> const&) : thread_block_tile() {} + }; + template <> + struct tiled_partition_impl<1, thread_block_tile<1> > : public thread_block_tile<1> { + _CG_QUALIFIER tiled_partition_impl(thread_block_tile<1> const&) : thread_block_tile<1>() {} + }; + +}; + +/** + * tiled_partition + * + * The tiled_partition(parent) method is a collective operation that + * partitions the parent group into a one-dimensional, row-major, tiling of subgroups. + * + * A total of ((size(parent)/tilesz) subgroups will be created, + * therefore the parent group size must be evenly divisible by the tilesz. 
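+ *
+ * For example (an illustrative sketch added here, not part of the original
+ * NVIDIA header; variable names are hypothetical):
+ *
+ *   thread_block block = this_thread_block();
+ *   thread_block_tile<32> tile = tiled_partition<32>(block);
+ *   int r = tile.thread_rank();       // 0..31 within the tile
+ *   int s = tile.shfl_down(r, 16);    // warp-synchronous shuffle within the tile
+ *   tile.sync();                      // barrier across the 32-thread tile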
+ * The allow parent groups are thread_block or thread_block_tile. + * + * The implementation may cause the calling thread to wait until all the members + * of the parent group have invoked the operation before resuming execution. + * + * Functionality is limited to native hardware sizes, 1/2/4/8/16/32. + * The size(parent) must be greater than the template Size parameter + * otherwise the results are undefined. + */ +template +_CG_QUALIFIER thread_block_tile tiled_partition(const ParentT& g) +{ + return (__internal::tiled_partition_impl(g)); +} + +_CG_END_NAMESPACE + +# endif /* ! (__cplusplus, __CUDACC__) */ + +#endif /* !_COOPERATIVE_GROUPS_H_ */ diff --git a/GraphBLAS/CUDA/templates/cooperative_groups_helpers.h b/GraphBLAS/CUDA/templates/cooperative_groups_helpers.h new file mode 100755 index 0000000000..f1c499f62e --- /dev/null +++ b/GraphBLAS/CUDA/templates/cooperative_groups_helpers.h @@ -0,0 +1,286 @@ + /* Copyright 1993-2016 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * The source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * The Licensed Deliverables contained herein are PROPRIETARY and + * CONFIDENTIAL to NVIDIA and are being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. 
+ */ + +/* +** Define: _CG_VERSION +*/ +# define _CG_VERSION 1000 + +/* +** Define: _CG_ABI_VERSION +*/ +# ifndef _CG_ABI_VERSION +# define _CG_ABI_VERSION 1 +# endif + +/* +** Define: _CG_ABI_EXPERIMENTAL +** Desc: If enabled, sets all features enabled (ABI-breaking or experimental) +*/ +# if defined(_CG_ABI_EXPERIMENTAL) +# endif + +# define _CG_CONCAT_INNER(x, y) x ## y +# define _CG_CONCAT_OUTER(x, y) _CG_CONCAT_INNER(x, y) +# define _CG_NAMESPACE _CG_CONCAT_OUTER(__v, _CG_ABI_VERSION) + +# define _CG_BEGIN_NAMESPACE \ + namespace cooperative_groups { namespace _CG_NAMESPACE { +# define _CG_END_NAMESPACE \ + }; using namespace _CG_NAMESPACE; }; + +# if !defined(_CG_STATIC_QUALIFIER) +# define _CG_STATIC_QUALIFIER static __forceinline__ __device__ +# endif +# if !defined(_CG_QUALIFIER) +# define _CG_QUALIFIER __forceinline__ __device__ +# endif + +# if (__CUDA_ARCH__ >= 600) || !defined(__CUDA_ARCH__) +# define _CG_HAS_GRID_GROUP +# endif +# if (__CUDA_ARCH__ >= 600) || !defined(__CUDA_ARCH__) +# define _CG_HAS_MULTI_GRID_GROUP +# endif +# if (__CUDA_ARCH__ >= 700) || !defined(__CUDA_ARCH__) +# define _CG_HAS_MATCH_COLLECTIVE +# endif +// Has __half and __half2 +// Only usable if you include the cuda_fp16.h extension, and +// _before_ including cooperative_groups.h +# ifdef __CUDA_FP16_TYPES_EXIST__ +# define _CG_HAS_FP16_COLLECTIVE +# endif + +/* +** Define: CG_DEBUG +** What: Enables various runtime safety checks +*/ +#if defined(__CUDACC_DEBUG__) && !defined(_CG_DEBUG) +# define _CG_DEBUG 1 +#endif + +#if defined(_CG_DEBUG) && (_CG_DEBUG == 1) && !defined(NDEBUG) +# include +# define _CG_ASSERT(x) assert((x)); +# define _CG_ABORT() assert(0); +#else +# define _CG_ASSERT(x) +# define _CG_ABORT() __trap(); +#endif + +_CG_BEGIN_NAMESPACE + +namespace __internal { + + enum groupType { + CoalescedTile, + Coalesced, + ThreadBlock, + Grid, + MultiGrid, + }; + +#if defined(_CG_HAS_GRID_GROUP) + + namespace grid { + + _CG_STATIC_QUALIFIER unsigned long long get_intrinsic_handle() + { + return (cudaCGGetIntrinsicHandle(cudaCGScopeGrid)); + } + + _CG_STATIC_QUALIFIER void sync(const unsigned long long handle) + { + cudaCGSynchronizeGrid(handle, 0); + } + + _CG_STATIC_QUALIFIER unsigned int size(const unsigned long long handle) + { + return (blockDim.z * gridDim.z) * + (blockDim.y * gridDim.y) * + (blockDim.x * gridDim.x); + } + + _CG_STATIC_QUALIFIER unsigned int thread_rank(const unsigned long long handle) + { + unsigned int blkIdx = ((blockIdx.z * gridDim.y * gridDim.x) + + (blockIdx.y * gridDim.x) + + blockIdx.x); + return (blkIdx * (blockDim.x * blockDim.y * blockDim.z) + + ((threadIdx.z * blockDim.y * blockDim.x) + + (threadIdx.y * blockDim.x) + + threadIdx.x)); + } + + _CG_STATIC_QUALIFIER dim3 grid_dim() + { + return (dim3(gridDim.x, gridDim.y, gridDim.z)); + } + }; + +#endif + +#if defined(_CG_HAS_MULTI_GRID_GROUP) + + namespace multi_grid { + + _CG_STATIC_QUALIFIER unsigned long long get_intrinsic_handle() + { + return (cudaCGGetIntrinsicHandle(cudaCGScopeMultiGrid)); + } + + _CG_STATIC_QUALIFIER void sync(const unsigned long long handle) + { + cudaError_t err = cudaCGSynchronize(handle, 0); + } + + _CG_STATIC_QUALIFIER unsigned int size(const unsigned long long handle) + { + unsigned int numThreads = 0; + cudaCGGetSize(&numThreads, NULL, handle); + return numThreads; + } + + _CG_STATIC_QUALIFIER unsigned int thread_rank(const unsigned long long handle) + { + unsigned int threadRank = 0; + cudaCGGetRank(&threadRank, NULL, handle); + return threadRank; + } + + _CG_STATIC_QUALIFIER 
unsigned int grid_rank(const unsigned long long handle) + { + unsigned int gridRank = 0; + cudaCGGetRank(NULL, &gridRank, handle); + return gridRank; + } + + _CG_STATIC_QUALIFIER unsigned int num_grids(const unsigned long long handle) + { + unsigned int numGrids = 0; + cudaCGGetSize(NULL, &numGrids, handle); + return numGrids; + } + + }; + +#endif + + namespace cta { + + _CG_STATIC_QUALIFIER void sync() + { + __barrier_sync(0); + } + + _CG_STATIC_QUALIFIER unsigned int size() + { + return (blockDim.x * blockDim.y * blockDim.z); + } + + _CG_STATIC_QUALIFIER unsigned int thread_rank() + { + return ((threadIdx.z * blockDim.y * blockDim.x) + + (threadIdx.y * blockDim.x) + + threadIdx.x); + } + + _CG_STATIC_QUALIFIER dim3 group_index() + { + return (dim3(blockIdx.x, blockIdx.y, blockIdx.z)); + } + + _CG_STATIC_QUALIFIER dim3 thread_index() + { + return (dim3(threadIdx.x, threadIdx.y, threadIdx.z)); + } + + _CG_STATIC_QUALIFIER dim3 block_dim() + { + return (dim3(blockDim.x, blockDim.y, blockDim.z)); + } + + }; + + _CG_STATIC_QUALIFIER unsigned int laneid() + { + unsigned int laneid; + asm volatile("mov.u32 %0, %%laneid;" : "=r"(laneid)); + return laneid; + } + + _CG_STATIC_QUALIFIER unsigned int warpsz() + { + unsigned int warpSize; + asm volatile("mov.u32 %0, WARP_SZ;" : "=r"(warpSize)); + return warpSize; + } + + _CG_STATIC_QUALIFIER unsigned int lanemask32_eq() + { + unsigned int lanemask32_eq; + asm volatile("mov.u32 %0, %%lanemask_eq;" : "=r"(lanemask32_eq)); + return (lanemask32_eq); + } + + _CG_STATIC_QUALIFIER unsigned int lanemask32_lt() + { + unsigned int lanemask32_lt; + asm volatile("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask32_lt)); + return (lanemask32_lt); + } + + _CG_STATIC_QUALIFIER void abort() + { + _CG_ABORT(); + } + +}; // !Namespace internal + +_CG_END_NAMESPACE diff --git a/GraphBLAS/CUDA/templates/denseDotProduct.cu b/GraphBLAS/CUDA/templates/denseDotProduct.cu new file mode 100644 index 0000000000..62841ed4b2 --- /dev/null +++ b/GraphBLAS/CUDA/templates/denseDotProduct.cu @@ -0,0 +1,202 @@ +//------------------------------------------------------------------------------ +// denseDotProduct.cu +//------------------------------------------------------------------------------ + +// The denseDotProduct CUDA kernel produces the semi-ring dot product of two +// vectors of types T1 and T2 and common size n, to a vector odata of type T3. +// ie. we want to produce dot(x,y) in the sense of the given semi-ring. + +// Both the grid and block are 1D, so blockDim.x is the # threads in a +// threadblock, and the # of threadblocks is grid.x + +// Let b = blockIdx.x, and let s be blockDim.x. +// Each threadblock owns s*8 contiguous items in the input data. + +// Thus, threadblock b owns g_idata [b*s*8 ... min(n,(b+1)*s*8-1)]. It's job +// is to reduce this data to a scalar, and write it to g_odata [b]. 
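+
+// Illustrative launch sketch (an assumption added for clarity, not part of
+// this kernel): each threadblock reduces 8*blockDim.x entries, so a host
+// caller would size the grid roughly as follows.  'x', 'y' and 'partial' are
+// hypothetical device pointers of types T1, T2 and T3:
+//
+//   int s = 128 ;                             // threads per block
+//   int nblocks = (n + 8*s - 1) / (8*s) ;     // one partial result per block
+//   denseDotProduct <<<nblocks, s>>> (x, y, partial, n) ;
+//   // the nblocks entries of 'partial' still need a final reduction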
+ +#include +#include "mySemiRing.h" +#include + +using namespace cooperative_groups; + +template< typename T3, int tile_sz> +__inline__ __device__ +T3 warp_ReduceSum(thread_block_tile g, T3 val) +{ + // Each iteration halves the number of active threads + // Each thread adds its partial sum[i] to sum[lane+i] + for (int i = g.size() / 2; i > 0; i /= 2) + { + T3 fold = g.shfl_down( val, i); + val = ADD( val, fold ); + } + return val; // note: only thread 0 will return full sum +} + +template +__inline__ __device__ +T3 block_ReduceSum(thread_block g, T3 val) +{ + static __shared__ T3 shared[warpSize]; // Shared mem for 32 partial sums + int lane = threadIdx.x % warpSize; + int wid = threadIdx.x / warpSize; + thread_block_tile tile = tiled_partition(g); + + // Each warp performs partial reduction + val = warp_ReduceSum(tile, val); + + if (lane==0) shared[wid]=val; // Write reduced value to shared memory + + __syncthreads(); // Wait for all partial reductions + + //read from shared memory only if that warp existed + val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : (T3)MONOID_IDENTITY3; + + + if (wid==0) val = warp_ReduceSum(tile,val); //Final reduce within first warp + + return val; +} + +template< typename T1, typename T2, typename T3> +__global__ void denseDotProduct +( + T1 *g_xdata, // array of size n, type T1 + T2 *g_ydata, // array of size n, type T2 + T3 *g_odata, // array of size grid.x, type T3 + unsigned int n +) +{ + // set thread ID + unsigned int tid = threadIdx.x ; + + // this threadblock b owns g_idata [block_start ... block_end-1] + unsigned long int s = blockDim.x ; + unsigned long int b = blockIdx.x ; + unsigned long int block_start = b * s * 8 ; + unsigned long int block_end = (b + 1) * s * 8 ; + + /* + if (tid == 0) + { + printf ("block %d: [%lu ... %ld]\n", b, block_start, block_end-1) ; + } + */ + + /* + if (tid == 0 && b == 0) + { + printf ("type is size %d\n", sizeof (T)) ; + for (int k = 0 ; k < n ; k++) printf ("%4d: %g\n", k, (double) g_idata [k]) ; + printf ("\n") ; + } + */ + + // each thread tid reduces its result into sum + T3 sum; + + // nothing to do + if (block_start > block_end) { return ; } + + // convert global data pointer to the local pointer of this block + T1 *xdata = g_xdata + block_start ; + T2 *ydata = g_ydata + block_start ; + + T1 x0, x1, x2, x3, x4, x5, x6, x7 ; + T2 y0, y1, y2, y3, y4, y5, y6, y7 ; + + if (block_end <= n) + { + // unrolling 8 + x0 = xdata [tid] ; + x1 = xdata [tid + s] ; + x2 = xdata [tid + 2 * s] ; + x3 = xdata [tid + 3 * s] ; + x4 = xdata [tid + 4 * s] ; + x5 = xdata [tid + 5 * s] ; + x6 = xdata [tid + 6 * s] ; + x7 = xdata [tid + 7 * s] ; + + y0 = ydata [tid] ; + y1 = ydata [tid + s] ; + y2 = ydata [tid + 2 * s] ; + y3 = ydata [tid + 3 * s] ; + y4 = ydata [tid + 4 * s] ; + y5 = ydata [tid + 5 * s] ; + y6 = ydata [tid + 6 * s] ; + y7 = ydata [tid + 7 * s] ; + /* + if (b == 0) + { + printf ("block zero: here is tid %2d : %g %g %g %g %g %g %g %g \n", tid, + (double) x0, (double) x1, (double) x2, (double) x3, + (double) x4, (double) x5, (double) x6, (double) x7) ; + } + */ + + } + else + { + // the last block has size less than 8*s + #define XDATA(i) ((i < lastblocksize) ? xdata [i] : MONOID_IDENTITY1) + #define YDATA(i) ((i < lastblocksize) ? 
ydata [i] : MONOID_IDENTITY2) + int lastblocksize = n - block_start ; + x0 = XDATA (tid) ; + x1 = XDATA (tid + s) ; + x2 = XDATA (tid + 2 * s) ; + x3 = XDATA (tid + 3 * s) ; + x4 = XDATA (tid + 4 * s) ; + x5 = XDATA (tid + 5 * s) ; + x6 = XDATA (tid + 6 * s) ; + x7 = XDATA (tid + 7 * s) ; + + y0 = YDATA (tid) ; + y1 = YDATA (tid + s) ; + y2 = YDATA (tid + 2 * s) ; + y3 = YDATA (tid + 3 * s) ; + y4 = YDATA (tid + 4 * s) ; + y5 = YDATA (tid + 5 * s) ; + y6 = YDATA (tid + 6 * s) ; + y7 = YDATA (tid + 7 * s) ; + } + + //work [tid] = mul(x0,y0) + mul(x1,y1) + mul(x2,y2) + mul(x3,y3) + // + mul(x4,y4) + mul(x5,y5) + mul(x6,y6)+ mul(x7,y7) ; + sum = ADD( MUL(x0,y0) , ADD( MUL(x1,y1) , ADD( MUL(x2,y2), + ADD( MUL(x3,y3) , ADD( MUL(x4,y4) , ADD( MUL(x5,y5), + ADD( MUL(x6,y6) , MUL(x7,y7)))))))) ; + + /* + if (b == 0) + { + printf ("block zero: still is tid %2d : %g %g %g %g %g %g %g %g \n", tid, + (double) x0, (double) x1, (double) x2, (double) x3, + (double) x4, (double) x5, (double) x6, (double) x7) ; + } + + if (b == 0) + { + printf ("block zero: here is tid %d result %g is %g\n", + tid, sum, + (double) (x0 + x1 + x2 + x3 + x4 + x5 + x6 + x7)) ; + } + */ + + __syncthreads ( ) ; + + //-------------------------------------------------------------------------- + // reduce per-thread sums to a single scalar + //-------------------------------------------------------------------------- + + sum = block_ReduceSum( this_thread_block(), sum); + + // write result for this block to global mem + if (tid == 0) + { + printf ("final %d : %g\n", b, (T3) sum) ; + g_odata [b] = sum ; + } +} + diff --git a/GraphBLAS/CUDA/templates/reduceNonZombiesWarp.cu b/GraphBLAS/CUDA/templates/reduceNonZombiesWarp.cu new file mode 100644 index 0000000000..02953cbd9d --- /dev/null +++ b/GraphBLAS/CUDA/templates/reduceNonZombiesWarp.cu @@ -0,0 +1,108 @@ +//------------------------------------------------------------------------------ +// reduceUnrolled.cu +//------------------------------------------------------------------------------ + +// The reduceUnrolled CUDA kernel reduces an array g_idata of size n, of any +// type T, to an array g_odata of size grid.x. Each threadblock (blockIdx.x) +// reduces its portion of g_idata to a single scalar, g_odata [blockIdx.x]. + +// Both the grid and block are 1D, so blockDim.x is the # threads in a +// threadblock, and the # of threadblocks is grid.x + +// Let b = blockIdx.x, and let s be blockDim.x. +// Each threadblock owns s*8 contiguous items in the input data. + +// Thus, threadblock b owns g_idata [b*s*8 ... min(n,(b+1)*s*8-1)]. It's job +// is to reduce this data to a scalar, and write it to g_odata [b]. 
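+
+// Note: unlike the unrolled kernel described in the comment above (which was
+// carried over from reduceUnrolled.cu), the kernel in this file uses a
+// grid-stride loop and skips "zombie" entries, i.e. entries whose index[i]
+// has been flipped (GB_FLIP) and is therefore negative.  An illustrative
+// launch sketch (hypothetical host code; blockDim must be a multiple of 32):
+//
+//   int s = 128 ;
+//   int nblocks = (N + s - 1) / s ;                    // or a capped grid size
+//   reduceNonZombiesWarp <double> <<<nblocks, s>>> (Ai, Ax, partial, N) ;
+//   // partial [0..nblocks-1] must then be reduced once more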
+ +#define GB_KERNEL +#include +#include +#include +#include "mySemiRing.h" + +using namespace cooperative_groups; + +template< typename T, int tile_sz> +__inline__ __device__ +T warp_ReduceSum( thread_block_tile g, T val) +{ + // Each iteration halves the number of active threads + // Each thread adds its partial sum[i] to sum[lane+i] + for (int i = g.size() / 2; i > 0; i /= 2) { + T fold = g.shfl_down( val, i); + //printf("thd%d %d OP %d is %d\n", threadIdx.x, val, fold, OP( val, fold)); + val = GB_ADD( val, fold ); + } + //if (threadIdx.x ==0) printf("thd%d single warp sum is %d\n", threadIdx.x, val); + return val; // note: only thread 0 will return full sum +} + +template +__inline__ __device__ +T block_ReduceSum(thread_block g, T val) +{ + static __shared__ T shared[warpSize]; // Shared mem for 32 partial sums + int lane = threadIdx.x & 31 ; // % warpSize; + int wid = threadIdx.x >> 5 ; // / warpSize; + thread_block_tile tile = tiled_partition( g ); + + // Each warp performs partial reduction + val = warp_ReduceSum( tile, val); + + // Wait for all partial reductions + if (lane==0) { + //printf("thd%d warp%d sum is %d\n", threadIdx.x, wid, val); + shared[wid] = val; // Write reduced value to shared memory + //printf("thd%d stored warp%d sum %d\n", threadIdx.x, wid, val); + } + __syncthreads(); // Wait for all partial reductions + + if (wid > 0 ) return val; + //read from shared memory only if that warp existed + else { + val = (threadIdx.x < (blockDim.x / warpSize) ) ? shared[lane] : GB_IDENTITY ; + //if (lane < (blockDim.x/ warpSize) ) printf("thd%d warp%d loaded val = %d\n", threadIdx.x, lane, val); + val = warp_ReduceSum( tile, val); //Final reduce within first warp + } + + return val; +} + +template< typename T> +__global__ void reduceNonZombiesWarp +( + int64_t *index, // array of size n + T *g_idata, // array of size n + T *g_odata, // array of size grid.x + unsigned int N +) +{ + // set thread ID + int tid = threadIdx.x ; + + // each thread tid reduces its result into sum + T sum = (T) GB_IDENTITY; + + for(int i = blockIdx.x * blockDim.x + threadIdx.x; + i < N; + i += blockDim.x * gridDim.x) { + if ( index[i] < 0) continue; + T fold = g_idata[i]; + sum = GB_ADD( sum, fold ); + } + //printf("thd%d sum is %d\n", threadIdx.x + blockDim.x*blockIdx.x, sum); + __syncthreads(); + //-------------------------------------------------------------------------- + // reduce work [0..s-1] to a single scalar + //-------------------------------------------------------------------------- + // this assumes blockDim is a multiple of 32 + sum = block_ReduceSum< T, 32 >( this_thread_block(), sum) ; + + // write result for this block to global mem + if (tid == 0) + { + g_odata [blockIdx.x] = sum ; + } +} + diff --git a/GraphBLAS/CUDA/templates/reduceUnrolled.cu b/GraphBLAS/CUDA/templates/reduceUnrolled.cu new file mode 100644 index 0000000000..da2e3e3eed --- /dev/null +++ b/GraphBLAS/CUDA/templates/reduceUnrolled.cu @@ -0,0 +1,187 @@ +//------------------------------------------------------------------------------ +// reduceUnrolled.cu +//------------------------------------------------------------------------------ + +// The reduceUnrolled CUDA kernel reduces an array g_idata of size n, of any +// type T, to an array g_odata of size grid.x. Each threadblock (blockIdx.x) +// reduces its portion of g_idata to a single scalar, g_odata [blockIdx.x]. 
+ +// Both the grid and block are 1D, so blockDim.x is the # threads in a +// threadblock, and the # of threadblocks is grid.x + +// Let b = blockIdx.x, and let s be blockDim.x. +// Each threadblock owns s*8 contiguous items in the input data. + +// Thus, threadblock b owns g_idata [b*s*8 ... min(n,(b+1)*s*8-1)]. It's job +// is to reduce this data to a scalar, and write it to g_odata [b]. + +#include "myOp.h" +#include +#include "GB_cuda.h" + +GrB_Matrix Stuff ; // hack hack hack + +using namespace cooperative_groups; + +template< typename T, int tile_sz> +__inline__ __device__ +T warp_ReduceSum( thread_block_tile g, T val) +{ + // Each iteration halves the number of active threads + // Each thread adds its partial sum[i] to sum[lane+i] + for (int i = g.size() / 2; i > 0; i /= 2) { + T fold = g.shfl_down( val, i); + //printf("thd%d %d OP %d is %d\n", threadIdx.x, val, fold, OP( val, fold)); + val = OP( val, fold ); + } + //if (threadIdx.x ==0) printf("thd%d single warp sum is %d\n", threadIdx.x, val); + return val; // note: only thread 0 will return full sum +} + +template +__inline__ __device__ +T block_ReduceSum(thread_block g, T val) +{ + static __shared__ T shared[warpSize]; // Shared mem for 32 partial sums + int lane = threadIdx.x % warpSize; + int wid = threadIdx.x / warpSize; + thread_block_tile tile = tiled_partition( g ); + + // Each warp performs partial reduction + val = warp_ReduceSum( tile, val); + + // Wait for all partial reductions + if (lane==0) { + //printf("thd%d warp%d sum is %d\n", threadIdx.x, wid, val); + shared[wid]=val; // Write reduced value to shared memory + //printf("thd%d stored warp %d sum %d\n", threadIdx.x, wid, val); + } + g.sync(); // Wait for all partial reductions + + if (wid > 0 || gridDim.x == 1 ) return val; + //read from shared memory only if that warp existed + val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : MONOID_IDENTITY; + //printf("thd%d warp loaded val = %d\n", threadIdx.x, lane, val); + + + if (wid==0) val = warp_ReduceSum( tile, val); //Final reduce within first warp + + return val; +} + +template< typename T> +__global__ void reduceUnrolled +( + T *g_idata, // array of size n + T *g_odata, // array of size grid.x + unsigned int n +) +{ + // set thread ID + unsigned int tid = threadIdx.x ; + + // this threadblock b owns g_idata [block_start ... block_end-1] + unsigned long int s = blockDim.x ; + unsigned long int b = blockIdx.x ; + unsigned long int block_start = b * s * 8 ; + unsigned long int block_end = (b + 1) * s * 8 ; + + /* + if (tid == 0) + { + printf ("block %d: [%lu ... 
%ld]\n", b, block_start, block_end-1) ; + } + */ + + /* + if (tid == 0 && b == 0) + { + printf ("type is size %d\n", sizeof (T)) ; + for (int k = 0 ; k < n ; k++) printf ("%4d: %g\n", k, (double) g_idata [k]) ; + printf ("\n") ; + } + */ + + // nothing to do + if (block_start > block_end) { if (tid == 0) printf ("bye!\n") ; return ; } + + // convert global data pointer to the local pointer of this block + T *idata = g_idata + block_start ; + + T x0, x1, x2, x3, x4, x5, x6, x7 ; + + if (block_end <= n) + { + // unrolling 8 + x0 = idata [tid] ; + x1 = idata [tid + s] ; + x2 = idata [tid + 2 * s] ; + x3 = idata [tid + 3 * s] ; + x4 = idata [tid + 4 * s] ; + x5 = idata [tid + 5 * s] ; + x6 = idata [tid + 6 * s] ; + x7 = idata [tid + 7 * s] ; + + /* + if (b == 0) + { + printf ("block zero: here is tid %2d : %g %g %g %g %g %g %g %g \n", tid, + (double) x0, (double) x1, (double) x2, (double) x3, + (double) x4, (double) x5, (double) x6, (double) x7) ; + } + */ + + } + else + { + // the last block has size less than 8*s + #define IDATA(i) ((i < lastblocksize) ? idata [i] : MONOID_IDENTITY) + int lastblocksize = n - block_start ; + x0 = IDATA (tid) ; + x1 = IDATA (tid + s) ; + x2 = IDATA (tid + 2 * s) ; + x3 = IDATA (tid + 3 * s) ; + x4 = IDATA (tid + 4 * s) ; + x5 = IDATA (tid + 5 * s) ; + x6 = IDATA (tid + 6 * s) ; + x7 = IDATA (tid + 7 * s) ; + } + T sum; + //work [tid] = x0 + x1 + x2 + x3 + x4 + x5 + x6 + x7 ; + sum = OP( x0 ,OP( x1, OP( x2, OP( x3, + OP( x4, OP( x5 , OP( x6 , x7))))))) ; + + /* + if (b == 0) + { + printf ("block zero: still is tid %2d : %g %g %g %g %g %g %g %g \n", tid, + (double) x0, (double) x1, (double) x2, (double) x3, + (double) x4, (double) x5, (double) x6, (double) x7) ; + } + + if (b == 0) + { + printf ("block zero: here is tid %d result %g is %g\n", + tid, (double) work [tid], + (double) (x0 + x1 + x2 + x3 + x4 + x5 + x6 + x7)) ; + } + */ + + __syncthreads ( ) ; + + //-------------------------------------------------------------------------- + // reduce work [0..s-1] to a single scalar + //-------------------------------------------------------------------------- + + // This assumes that s is a power of 2 and <= 1024, and at least 32 + // This assumes blockDim is a multiple of 32 + sum = block_ReduceSum( this_thread_block(), sum); + + // write result for this block to global mem + if (tid == 0) + { + // printf ("final %d : %g\n", b, (double) work [0]) ; + g_odata [b] = sum ; + } +} + diff --git a/GraphBLAS/CUDA/templates/reduceWarp.cu b/GraphBLAS/CUDA/templates/reduceWarp.cu new file mode 100644 index 0000000000..000733b522 --- /dev/null +++ b/GraphBLAS/CUDA/templates/reduceWarp.cu @@ -0,0 +1,103 @@ +//------------------------------------------------------------------------------ +// reduceUnrolled.cu +//------------------------------------------------------------------------------ + +// The reduceUnrolled CUDA kernel reduces an array g_idata of size n, of any +// type T, to an array g_odata of size grid.x. Each threadblock (blockIdx.x) +// reduces its portion of g_idata to a single scalar, g_odata [blockIdx.x]. + +// Both the grid and block are 1D, so blockDim.x is the # threads in a +// threadblock, and the # of threadblocks is grid.x + +// Let b = blockIdx.x, and let s be blockDim.x. +// Each threadblock owns s*8 contiguous items in the input data. + +// Thus, threadblock b owns g_idata [b*s*8 ... min(n,(b+1)*s*8-1)]. It's job +// is to reduce this data to a scalar, and write it to g_odata [b]. 
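reduceWarp.cu below, like the other templates in this directory, never names a concrete semiring: the ADD/MUL/MULADD operators and the monoid identities arrive through macros supplied by a header that the JIT machinery generates and includes as mySemiRing.h. Purely as an illustration of the contract the templates assume, a hand-written stand-in for a conventional PLUS_TIMES semiring over float could look like the sketch below; every definition in it is an assumption for the sketch, not the generated code.

// mySemiRing.h -- illustrative stand-in for the JIT-generated semiring header.
// It supplies the macros the templates in this directory refer to, specialized
// here (as an assumption for the sketch) to PLUS_TIMES over float.
#pragma once

#define MONOID_IDENTITY   0.0f              // identity of the additive monoid
#define MONOID_IDENTITY1  0.0f              // identity for the T1 inputs (denseDotProduct)
#define MONOID_IDENTITY2  0.0f              // identity for the T2 inputs (denseDotProduct)
#define MONOID_IDENTITY3  0.0f              // identity for the T3 accumulator

#define ADD(x,y)        ((x) + (y))         // additive monoid
#define MUL(x,y)        ((x) * (y))         // multiplicative operator
#define MULADD(s,x,y)   ((s) += (x) * (y))  // fused form used by sparseDotProduct
#define OP(x,y)         ADD (x, y)          // generic reduction operator (reduceWarp, reduceUnrolled)
#define GB_ADD(x,y)     ADD (x, y)          // GB_-prefixed aliases used by reduceNonZombiesWarp
#define GB_IDENTITY     MONOID_IDENTITY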
+ +#include "mySemiRing.h" +#include + +using namespace cooperative_groups; + +template< typename T, int tile_sz> +__inline__ __device__ +T warp_ReduceSum( thread_block_tile g, T val) +{ + // Each iteration halves the number of active threads + // Each thread adds its partial sum[i] to sum[lane+i] + for (int i = g.size() / 2; i > 0; i /= 2) { + T fold = g.shfl_down( val, i); + //printf("thd%d %d OP %d is %d\n", threadIdx.x, val, fold, OP( val, fold)); + val = OP( val, fold ); + } + //if (threadIdx.x ==0) printf("thd%d single warp sum is %d\n", threadIdx.x, val); + return val; // note: only thread 0 will return full sum +} + +template +__inline__ __device__ +T block_ReduceSum(thread_block g, T val) +{ + static __shared__ T shared[warpSize]; // Shared mem for 32 partial sums + int lane = threadIdx.x % warpSize; + int wid = threadIdx.x / warpSize; + thread_block_tile tile = tiled_partition( g ); + + // Each warp performs partial reduction + val = warp_ReduceSum( tile, val); + + // Wait for all partial reductions + if (lane==0) { + //printf("thd%d warp%d sum is %d\n", threadIdx.x, wid, val); + shared[wid]=val; // Write reduced value to shared memory + //printf("thd%d stored warp %d sum %d\n", threadIdx.x, wid, val); + } + __syncthreads(); // Wait for all partial reductions + + if (wid > 0 || gridDim.x == 1 ) return val; + //read from shared memory only if that warp existed + val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : MONOID_IDENTITY; + //printf("thd%d warp loaded val = %d\n", threadIdx.x, lane, val); + + + if (wid==0) val = warp_ReduceSum( tile, val); //Final reduce within first warp + + return val; +} + +template< typename T> +__global__ void reduceWarp +( + T *g_idata, // array of size n + T *g_odata, // array of size grid.x + unsigned int N +) +{ + // set thread ID + unsigned int tid = threadIdx.x ; + + // each thread tid reduces its result into sum + T sum = (T) MONOID_IDENTITY; + + for(int i = blockIdx.x * blockDim.x + threadIdx.x; + i < N; + i += blockDim.x * gridDim.x) { + sum = OP( sum, g_idata[i]); + } + //printf("thd%d sum is %d\n", threadIdx.x + blockDim.x*blockIdx.x, sum); + __syncthreads(); + //-------------------------------------------------------------------------- + // reduce work [0..s-1] to a single scalar + //-------------------------------------------------------------------------- + // this assumes blockDim is a multiple of 32 + sum = block_ReduceSum( this_thread_block(), sum); + + // write result for this block to global mem + if (tid == 0) + { + // printf ("final %d : %g\n", b, (double) work [0]) ; + g_odata [blockIdx.x] = sum ; + } +} + diff --git a/GraphBLAS/CUDA/templates/sparseDotProduct.cu b/GraphBLAS/CUDA/templates/sparseDotProduct.cu new file mode 100644 index 0000000000..ee1943b21c --- /dev/null +++ b/GraphBLAS/CUDA/templates/sparseDotProduct.cu @@ -0,0 +1,189 @@ +//------------------------------------------------------------------------------ +// sparseDotProduct_merge_path.cu +//------------------------------------------------------------------------------ + +// The sparseDotProduct CUDA kernel produces the semi-ring dot product of two +// sparse vectors of types T1 and T2 and common index space size n, to a scalar +// odata of type T3. The vectors are sparse, with different numbers of non-zeros. +// ie. we want to produce dot(x,y) in the sense of the given semi-ring. + +// This version uses a merge-path algorithm, when the sizes g_xnz and g_ynz are +// relatively close in size, but for any size of N. 
+ +// Both the grid and block are 1D, so blockDim.x is the # threads in a +// threadblock, and the # of threadblocks is grid.x + +// Let b = blockIdx.x, and let s be blockDim.x. s= 32 with a variable number +// of active threads = min( min(g_xnz, g_ynz), 32) + +// Thus, threadblock b owns a part of the index set spanned by g_xi and g_yi. Its job +// is to find the intersection of the index sets g_xi and g_yi, perform the semi-ring dot +// product on those items in the intersection, and finally reduce this data to a scalar, +// on exit write it to g_odata [b]. + +#include +#include +#include "mySemiRing.h" + +using namespace cooperative_groups; + +template< typename T, int tile_sz> +__device__ T reduce_sum(thread_block_tile g, T val) +{ + // Each iteration halves the number of active threads + // Each thread adds its partial sum[i] to sum[lane+i] + for (int i = g.size() / 2; i > 0; i /= 2) + { + val = ADD( val, g.shfl_down(val,i) ); + //if (g.thread_rank() ==0) + // printf("in reduce_sum i=%i val = %f\n", i, val); + } + return val; // note: only thread 0 will return full sum +} + +#define INTMIN( A, B) ( (A) < (B) ) ? (A) : (B) +#define INTMAX( A, B) ( (A) > (B) ) ? (A) : (B) +#define intersects_per_thread 4 + +template< typename T1, typename T2, typename T3> +__global__ void sparseDotProduct +( + unsigned int g_xnz, // Number of non-zeros in x + unsigned int *g_xi, // Non-zero indices in x, size xnz + T1 *g_xdata, // array of size xnz, type T1 + unsigned int g_ynz, // Number of non-zeros in y + unsigned int *g_yi, // Non-zero indices in y, size ynz + T2 *g_ydata, // array of size ynz, type T2 + T3 *g_odata // array of size grid.x, type T3 +) +{ + // set thread ID + unsigned int tid_global = threadIdx.x+ blockDim.x* blockIdx.x; + unsigned int tid = threadIdx.x; + + unsigned long int b = blockIdx.x ; + + // total items to be inspected + unsigned int nxy = (g_xnz + g_ynz); + + //largest possible number of intersections is the smaller nz + unsigned int n_intersect = INTMIN( g_xnz, g_ynz); + + //we want more than one intersection per thread + unsigned int parts = (n_intersect+ intersects_per_thread -1)/ intersects_per_thread; + + unsigned int work_per_thread = (nxy +parts -1)/parts; + unsigned int diag = INTMIN( work_per_thread*tid_global, nxy); + unsigned int diag_end = INTMIN( diag + work_per_thread, nxy); + //printf(" thd%d parts = %u wpt = %u diag, diag_end = %u,%u\n",tid, parts, work_per_thread, diag, diag_end); + + unsigned int x_min = INTMAX( (int)(diag - g_ynz), 0); + unsigned int x_max = INTMIN( diag, g_xnz); + + //printf("start thd%u x_min = %u x_max = %u\n", tid_global, x_min,x_max); + while ( x_min < x_max) { //binary search for correct diag break + unsigned int pivot = (x_min +x_max)/2; + if ( g_xi[pivot] < g_yi[ diag -pivot -1]) { + x_min = pivot +1; + } + else { + x_max = pivot; + } + } + int xcoord = x_min; + int ycoord = diag -x_min -1; + if (( diag > 0) &&(diag < (g_xnz+g_ynz)) && (g_xi[xcoord] == g_yi[ycoord]) ) { + diag--; //adjust for intersection incrementing both pointers + } + // two start points are known now + int x_start = xcoord; + int y_start = diag -xcoord; + + //if (x_start != y_start) + // printf("start thd%u xs,ys = %i,%i\n", tid_global, x_start, y_start); + + x_min = INTMAX( (int)(diag_end - g_ynz), 0); + x_max = INTMIN( diag_end, g_xnz); + + while ( x_min < x_max) { + unsigned int pivot = (x_min +x_max)/2; + //printf("thd%u pre_sw piv=%u diag_e = %u xmin,xmax=%u,%u\n", tid_global, pivot, diag_end,x_min, x_max); + if ( g_xi[pivot] < g_yi[ diag_end -pivot -1]) { 
+ x_min = pivot +1; + } + else { + x_max = pivot; + } + //printf("thd%u piv=%u xmin,xmax = %u,%u\n", tid_global, pivot, x_min, x_max); + } + xcoord = x_min; + ycoord = diag_end -x_min -1; + if ( (diag_end < (g_xnz+g_ynz)) && (g_xi[xcoord] == g_yi[ycoord]) ) { + diag--; //adjust for intersection incrementing both pointers + } + // two end points are known now + int x_end = xcoord; + int y_end = diag_end - xcoord; + + /* + if (tid == 0 && b == 0) { + printf ("type1 is size %d\n", sizeof (T1)) ; + for (int k = 0 ; k < g_xnz ; k++) printf ("%4d: %g,", k, (T1) g_xdata [k]) ; + printf ("\n") ; + printf ("type2 is size %d\n", sizeof (T2)) ; + for (int k = 0 ; k < g_ynz ; k++) printf ("%4d: %g,", k, (T2) g_ydata [k]) ; + printf ("\n") ; + } + __syncthreads(); + */ + + T3 sum = (T3) 0; + //printf(" thd%u has init value %f\n",tid, sum); + + // nothing to do + if ( (x_start >= x_end) || (y_start >= y_end) ) { return ; } + + //merge-path dot product + int k = x_start; + int l = y_start; + while ( k < x_end && l < y_end ) + { + if ( g_xi[k] < g_yi[l] ) k += 1; + else if ( g_xi[k] > g_yi[l] ) l += 1; + else { + //printf(" thd%d ix at %u \n",tid_global,g_xi[k]); + //printf(" sum += %f * %f \n",tid,g_xdata[k],g_ydata[l]); + //sum = ADD( sum, MUL( g_xdata[k], g_ydata[l])); + MULADD( sum, g_xdata[k], g_ydata[l]); + //printf(" thd%u work value = %f\n",tid_global, sum); + k+= 1; + l+= 1; + } + + } + + __syncthreads ( ) ; + /* + if (1) + { + printf ("thd%u done with intersect and multiply, val = %f\n",tid_global, sum) ; + } + __syncthreads ( ) ; + */ + + //-------------------------------------------------------------------------- + // reduce sum per-thread values to a single scalar + //-------------------------------------------------------------------------- + // Using tile size fixed at compile time, we don't need shared memory + #define tile_sz 32 + thread_block_tile tile = tiled_partition( this_thread_block()); + T3 block_sum = reduce_sum(tile, sum); + + // write result for this block to global mem + if (tid == 0) + { + printf ("final %d : %g\n", b, block_sum) ; + g_odata [b] = block_sum ; + } +} + diff --git a/GraphBLAS/CUDA/templates/stuff.cu b/GraphBLAS/CUDA/templates/stuff.cu new file mode 100644 index 0000000000..9241fd1ebb --- /dev/null +++ b/GraphBLAS/CUDA/templates/stuff.cu @@ -0,0 +1,9 @@ + val = ADD( val, g.shfl_down( val, i) ); + + + t = g.shfl_down( val, i) ; + val = ADD( val, t ); + + GB_ADD (val, t) ; // statment val = GB_ADD_FUNCTION (val, t) + + diff --git a/GraphBLAS/CUDA/test/AxB_dot3_cuda_tests.hpp b/GraphBLAS/CUDA/test/AxB_dot3_cuda_tests.hpp new file mode 100644 index 0000000000..197d24e5ec --- /dev/null +++ b/GraphBLAS/CUDA/test/AxB_dot3_cuda_tests.hpp @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: Apache-2.0 + +// Test AxB_dot3_cuda kernels +// Using data generators and test classes, cover +// all 12 cases for the masked GEMM ( C, M, A, B) in GraphBLAS +// Tests Semirings, data types and a range of data input sizes and shapes +// Connects to the jitFactory for launches. 
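The instances generated into AxB_dot3_test_instances.hpp (the next file) are dense one-liners; reformatted, a single instance has the shape shown below. The seven template arguments are the C, M, A, B, X, Y and Z types spelled out in the test name, and the positional arguments are passed straight through to the factory: (5, 32, 256, 128) for the "tinyxtiny" variants and (5, 1024, 65536, 65536) for the "smallxsmall" ones.

TEST (AxB_dot3_tests_warp,
      tinyxtinyPLUS_TIMESCint32_tMboolAint32_tBint32_tXint32_tYint32_tZint32_t)
{
    std::string SR = "PLUS_TIMES" ;
    test_AxB_dot3_warp_factory <int32_t, bool, int32_t, int32_t,
                                int32_t, int32_t, int32_t> (5, 32, 256, 128, SR) ;
}

Any subset of these combinations can then be selected at run time with the standard googletest filter, for example ./cudaTest --gtest_filter='AxB_dot3_tests_warp.tinyxtiny*' (cudaTest being the target built by the test Makefile further below).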
+ +#include +#include +#include +#include +#include +#include "jitTestFactory.hpp" +#include "gtest/gtest.h" + +//Test instances and groupings +#include "AxB_dot3_test_instances.hpp" + diff --git a/GraphBLAS/CUDA/test/AxB_dot3_test_instances.hpp b/GraphBLAS/CUDA/test/AxB_dot3_test_instances.hpp new file mode 100644 index 0000000000..1719df741c --- /dev/null +++ b/GraphBLAS/CUDA/test/AxB_dot3_test_instances.hpp @@ -0,0 +1,97 @@ +// SPDX-License-Identifier: Apache-2.0 +TEST( AxB_dot3_tests_warp, tinyxtinyPLUS_TIMESCint32_tMboolAint32_tBint32_tXint32_tYint32_tZint32_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< int32_t,bool,int32_t,int32_t,int32_t,int32_t,int32_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallPLUS_TIMESCint32_tMboolAint32_tBint32_tXint32_tYint32_tZint32_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< int32_t,bool,int32_t,int32_t,int32_t,int32_t,int32_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyPLUS_TIMESCint32_tMboolAint32_tBuint64_tXint32_tYint32_tZint32_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< int32_t,bool,int32_t,uint64_t,int32_t,int32_t,int32_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallPLUS_TIMESCint32_tMboolAint32_tBuint64_tXint32_tYint32_tZint32_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< int32_t,bool,int32_t,uint64_t,int32_t,int32_t,int32_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyPLUS_TIMESCint32_tMboolAuint64_tBint32_tXint32_tYint32_tZint32_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< int32_t,bool,uint64_t,int32_t,int32_t,int32_t,int32_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallPLUS_TIMESCint32_tMboolAuint64_tBint32_tXint32_tYint32_tZint32_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< int32_t,bool,uint64_t,int32_t,int32_t,int32_t,int32_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyPLUS_TIMESCint32_tMboolAuint64_tBuint64_tXint32_tYint32_tZint32_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< int32_t,bool,uint64_t,uint64_t,int32_t,int32_t,int32_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallPLUS_TIMESCint32_tMboolAuint64_tBuint64_tXint32_tYint32_tZint32_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< int32_t,bool,uint64_t,uint64_t,int32_t,int32_t,int32_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyPLUS_TIMESCint32_tMint32_tAint32_tBint32_tXint32_tYint32_tZint32_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< int32_t,int32_t,int32_t,int32_t,int32_t,int32_t,int32_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallPLUS_TIMESCint32_tMint32_tAint32_tBint32_tXint32_tYint32_tZint32_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< int32_t,int32_t,int32_t,int32_t,int32_t,int32_t,int32_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyPLUS_TIMESCint32_tMint32_tAint32_tBuint64_tXint32_tYint32_tZint32_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< int32_t,int32_t,int32_t,uint64_t,int32_t,int32_t,int32_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallPLUS_TIMESCint32_tMint32_tAint32_tBuint64_tXint32_tYint32_tZint32_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< int32_t,int32_t,int32_t,uint64_t,int32_t,int32_t,int32_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, 
tinyxtinyPLUS_TIMESCint32_tMint32_tAuint64_tBint32_tXint32_tYint32_tZint32_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< int32_t,int32_t,uint64_t,int32_t,int32_t,int32_t,int32_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallPLUS_TIMESCint32_tMint32_tAuint64_tBint32_tXint32_tYint32_tZint32_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< int32_t,int32_t,uint64_t,int32_t,int32_t,int32_t,int32_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyPLUS_TIMESCint32_tMint32_tAuint64_tBuint64_tXint32_tYint32_tZint32_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< int32_t,int32_t,uint64_t,uint64_t,int32_t,int32_t,int32_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallPLUS_TIMESCint32_tMint32_tAuint64_tBuint64_tXint32_tYint32_tZint32_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< int32_t,int32_t,uint64_t,uint64_t,int32_t,int32_t,int32_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyPLUS_TIMESCuint64_tMboolAint32_tBint32_tXuint64_tYuint64_tZuint64_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< uint64_t,bool,int32_t,int32_t,uint64_t,uint64_t,uint64_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallPLUS_TIMESCuint64_tMboolAint32_tBint32_tXuint64_tYuint64_tZuint64_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< uint64_t,bool,int32_t,int32_t,uint64_t,uint64_t,uint64_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyPLUS_TIMESCuint64_tMboolAint32_tBuint64_tXuint64_tYuint64_tZuint64_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< uint64_t,bool,int32_t,uint64_t,uint64_t,uint64_t,uint64_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallPLUS_TIMESCuint64_tMboolAint32_tBuint64_tXuint64_tYuint64_tZuint64_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< uint64_t,bool,int32_t,uint64_t,uint64_t,uint64_t,uint64_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyPLUS_TIMESCuint64_tMboolAuint64_tBint32_tXuint64_tYuint64_tZuint64_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< uint64_t,bool,uint64_t,int32_t,uint64_t,uint64_t,uint64_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallPLUS_TIMESCuint64_tMboolAuint64_tBint32_tXuint64_tYuint64_tZuint64_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< uint64_t,bool,uint64_t,int32_t,uint64_t,uint64_t,uint64_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyPLUS_TIMESCuint64_tMboolAuint64_tBuint64_tXuint64_tYuint64_tZuint64_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< uint64_t,bool,uint64_t,uint64_t,uint64_t,uint64_t,uint64_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallPLUS_TIMESCuint64_tMboolAuint64_tBuint64_tXuint64_tYuint64_tZuint64_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< uint64_t,bool,uint64_t,uint64_t,uint64_t,uint64_t,uint64_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyPLUS_TIMESCuint64_tMint32_tAint32_tBint32_tXuint64_tYuint64_tZuint64_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< uint64_t,int32_t,int32_t,int32_t,uint64_t,uint64_t,uint64_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallPLUS_TIMESCuint64_tMint32_tAint32_tBint32_tXuint64_tYuint64_tZuint64_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< uint64_t,int32_t,int32_t,int32_t,uint64_t,uint64_t,uint64_t > (5, 1024, 65536, 65536, SR);} +TEST( 
AxB_dot3_tests_warp, tinyxtinyPLUS_TIMESCuint64_tMint32_tAint32_tBuint64_tXuint64_tYuint64_tZuint64_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< uint64_t,int32_t,int32_t,uint64_t,uint64_t,uint64_t,uint64_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallPLUS_TIMESCuint64_tMint32_tAint32_tBuint64_tXuint64_tYuint64_tZuint64_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< uint64_t,int32_t,int32_t,uint64_t,uint64_t,uint64_t,uint64_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyPLUS_TIMESCuint64_tMint32_tAuint64_tBint32_tXuint64_tYuint64_tZuint64_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< uint64_t,int32_t,uint64_t,int32_t,uint64_t,uint64_t,uint64_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallPLUS_TIMESCuint64_tMint32_tAuint64_tBint32_tXuint64_tYuint64_tZuint64_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< uint64_t,int32_t,uint64_t,int32_t,uint64_t,uint64_t,uint64_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyPLUS_TIMESCuint64_tMint32_tAuint64_tBuint64_tXuint64_tYuint64_tZuint64_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< uint64_t,int32_t,uint64_t,uint64_t,uint64_t,uint64_t,uint64_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallPLUS_TIMESCuint64_tMint32_tAuint64_tBuint64_tXuint64_tYuint64_tZuint64_t){ std::string SR = "PLUS_TIMES"; test_AxB_dot3_warp_factory< uint64_t,int32_t,uint64_t,uint64_t,uint64_t,uint64_t,uint64_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMIN_PLUSCint32_tMboolAint32_tBint32_tXint32_tYint32_tZint32_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< int32_t,bool,int32_t,int32_t,int32_t,int32_t,int32_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMIN_PLUSCint32_tMboolAint32_tBint32_tXint32_tYint32_tZint32_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< int32_t,bool,int32_t,int32_t,int32_t,int32_t,int32_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMIN_PLUSCint32_tMboolAint32_tBuint64_tXint32_tYint32_tZint32_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< int32_t,bool,int32_t,uint64_t,int32_t,int32_t,int32_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMIN_PLUSCint32_tMboolAint32_tBuint64_tXint32_tYint32_tZint32_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< int32_t,bool,int32_t,uint64_t,int32_t,int32_t,int32_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMIN_PLUSCint32_tMboolAuint64_tBint32_tXint32_tYint32_tZint32_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< int32_t,bool,uint64_t,int32_t,int32_t,int32_t,int32_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMIN_PLUSCint32_tMboolAuint64_tBint32_tXint32_tYint32_tZint32_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< int32_t,bool,uint64_t,int32_t,int32_t,int32_t,int32_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMIN_PLUSCint32_tMboolAuint64_tBuint64_tXint32_tYint32_tZint32_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< int32_t,bool,uint64_t,uint64_t,int32_t,int32_t,int32_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMIN_PLUSCint32_tMboolAuint64_tBuint64_tXint32_tYint32_tZint32_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< int32_t,bool,uint64_t,uint64_t,int32_t,int32_t,int32_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, 
tinyxtinyMIN_PLUSCint32_tMint32_tAint32_tBint32_tXint32_tYint32_tZint32_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< int32_t,int32_t,int32_t,int32_t,int32_t,int32_t,int32_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMIN_PLUSCint32_tMint32_tAint32_tBint32_tXint32_tYint32_tZint32_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< int32_t,int32_t,int32_t,int32_t,int32_t,int32_t,int32_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMIN_PLUSCint32_tMint32_tAint32_tBuint64_tXint32_tYint32_tZint32_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< int32_t,int32_t,int32_t,uint64_t,int32_t,int32_t,int32_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMIN_PLUSCint32_tMint32_tAint32_tBuint64_tXint32_tYint32_tZint32_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< int32_t,int32_t,int32_t,uint64_t,int32_t,int32_t,int32_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMIN_PLUSCint32_tMint32_tAuint64_tBint32_tXint32_tYint32_tZint32_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< int32_t,int32_t,uint64_t,int32_t,int32_t,int32_t,int32_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMIN_PLUSCint32_tMint32_tAuint64_tBint32_tXint32_tYint32_tZint32_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< int32_t,int32_t,uint64_t,int32_t,int32_t,int32_t,int32_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMIN_PLUSCint32_tMint32_tAuint64_tBuint64_tXint32_tYint32_tZint32_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< int32_t,int32_t,uint64_t,uint64_t,int32_t,int32_t,int32_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMIN_PLUSCint32_tMint32_tAuint64_tBuint64_tXint32_tYint32_tZint32_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< int32_t,int32_t,uint64_t,uint64_t,int32_t,int32_t,int32_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMIN_PLUSCuint64_tMboolAint32_tBint32_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< uint64_t,bool,int32_t,int32_t,uint64_t,uint64_t,uint64_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMIN_PLUSCuint64_tMboolAint32_tBint32_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< uint64_t,bool,int32_t,int32_t,uint64_t,uint64_t,uint64_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMIN_PLUSCuint64_tMboolAint32_tBuint64_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< uint64_t,bool,int32_t,uint64_t,uint64_t,uint64_t,uint64_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMIN_PLUSCuint64_tMboolAint32_tBuint64_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< uint64_t,bool,int32_t,uint64_t,uint64_t,uint64_t,uint64_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMIN_PLUSCuint64_tMboolAuint64_tBint32_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< uint64_t,bool,uint64_t,int32_t,uint64_t,uint64_t,uint64_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMIN_PLUSCuint64_tMboolAuint64_tBint32_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< uint64_t,bool,uint64_t,int32_t,uint64_t,uint64_t,uint64_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, 
tinyxtinyMIN_PLUSCuint64_tMboolAuint64_tBuint64_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< uint64_t,bool,uint64_t,uint64_t,uint64_t,uint64_t,uint64_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMIN_PLUSCuint64_tMboolAuint64_tBuint64_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< uint64_t,bool,uint64_t,uint64_t,uint64_t,uint64_t,uint64_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMIN_PLUSCuint64_tMint32_tAint32_tBint32_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< uint64_t,int32_t,int32_t,int32_t,uint64_t,uint64_t,uint64_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMIN_PLUSCuint64_tMint32_tAint32_tBint32_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< uint64_t,int32_t,int32_t,int32_t,uint64_t,uint64_t,uint64_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMIN_PLUSCuint64_tMint32_tAint32_tBuint64_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< uint64_t,int32_t,int32_t,uint64_t,uint64_t,uint64_t,uint64_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMIN_PLUSCuint64_tMint32_tAint32_tBuint64_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< uint64_t,int32_t,int32_t,uint64_t,uint64_t,uint64_t,uint64_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMIN_PLUSCuint64_tMint32_tAuint64_tBint32_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< uint64_t,int32_t,uint64_t,int32_t,uint64_t,uint64_t,uint64_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMIN_PLUSCuint64_tMint32_tAuint64_tBint32_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< uint64_t,int32_t,uint64_t,int32_t,uint64_t,uint64_t,uint64_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMIN_PLUSCuint64_tMint32_tAuint64_tBuint64_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< uint64_t,int32_t,uint64_t,uint64_t,uint64_t,uint64_t,uint64_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMIN_PLUSCuint64_tMint32_tAuint64_tBuint64_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MIN_PLUS"; test_AxB_dot3_warp_factory< uint64_t,int32_t,uint64_t,uint64_t,uint64_t,uint64_t,uint64_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMAX_PLUSCint32_tMboolAint32_tBint32_tXint32_tYint32_tZint32_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< int32_t,bool,int32_t,int32_t,int32_t,int32_t,int32_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMAX_PLUSCint32_tMboolAint32_tBint32_tXint32_tYint32_tZint32_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< int32_t,bool,int32_t,int32_t,int32_t,int32_t,int32_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMAX_PLUSCint32_tMboolAint32_tBuint64_tXint32_tYint32_tZint32_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< int32_t,bool,int32_t,uint64_t,int32_t,int32_t,int32_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMAX_PLUSCint32_tMboolAint32_tBuint64_tXint32_tYint32_tZint32_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< int32_t,bool,int32_t,uint64_t,int32_t,int32_t,int32_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, 
tinyxtinyMAX_PLUSCint32_tMboolAuint64_tBint32_tXint32_tYint32_tZint32_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< int32_t,bool,uint64_t,int32_t,int32_t,int32_t,int32_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMAX_PLUSCint32_tMboolAuint64_tBint32_tXint32_tYint32_tZint32_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< int32_t,bool,uint64_t,int32_t,int32_t,int32_t,int32_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMAX_PLUSCint32_tMboolAuint64_tBuint64_tXint32_tYint32_tZint32_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< int32_t,bool,uint64_t,uint64_t,int32_t,int32_t,int32_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMAX_PLUSCint32_tMboolAuint64_tBuint64_tXint32_tYint32_tZint32_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< int32_t,bool,uint64_t,uint64_t,int32_t,int32_t,int32_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMAX_PLUSCint32_tMint32_tAint32_tBint32_tXint32_tYint32_tZint32_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< int32_t,int32_t,int32_t,int32_t,int32_t,int32_t,int32_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMAX_PLUSCint32_tMint32_tAint32_tBint32_tXint32_tYint32_tZint32_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< int32_t,int32_t,int32_t,int32_t,int32_t,int32_t,int32_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMAX_PLUSCint32_tMint32_tAint32_tBuint64_tXint32_tYint32_tZint32_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< int32_t,int32_t,int32_t,uint64_t,int32_t,int32_t,int32_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMAX_PLUSCint32_tMint32_tAint32_tBuint64_tXint32_tYint32_tZint32_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< int32_t,int32_t,int32_t,uint64_t,int32_t,int32_t,int32_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMAX_PLUSCint32_tMint32_tAuint64_tBint32_tXint32_tYint32_tZint32_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< int32_t,int32_t,uint64_t,int32_t,int32_t,int32_t,int32_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMAX_PLUSCint32_tMint32_tAuint64_tBint32_tXint32_tYint32_tZint32_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< int32_t,int32_t,uint64_t,int32_t,int32_t,int32_t,int32_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMAX_PLUSCint32_tMint32_tAuint64_tBuint64_tXint32_tYint32_tZint32_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< int32_t,int32_t,uint64_t,uint64_t,int32_t,int32_t,int32_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMAX_PLUSCint32_tMint32_tAuint64_tBuint64_tXint32_tYint32_tZint32_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< int32_t,int32_t,uint64_t,uint64_t,int32_t,int32_t,int32_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMAX_PLUSCuint64_tMboolAint32_tBint32_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< uint64_t,bool,int32_t,int32_t,uint64_t,uint64_t,uint64_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMAX_PLUSCuint64_tMboolAint32_tBint32_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< uint64_t,bool,int32_t,int32_t,uint64_t,uint64_t,uint64_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMAX_PLUSCuint64_tMboolAint32_tBuint64_tXuint64_tYuint64_tZuint64_t){ 
std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< uint64_t,bool,int32_t,uint64_t,uint64_t,uint64_t,uint64_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMAX_PLUSCuint64_tMboolAint32_tBuint64_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< uint64_t,bool,int32_t,uint64_t,uint64_t,uint64_t,uint64_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMAX_PLUSCuint64_tMboolAuint64_tBint32_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< uint64_t,bool,uint64_t,int32_t,uint64_t,uint64_t,uint64_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMAX_PLUSCuint64_tMboolAuint64_tBint32_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< uint64_t,bool,uint64_t,int32_t,uint64_t,uint64_t,uint64_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMAX_PLUSCuint64_tMboolAuint64_tBuint64_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< uint64_t,bool,uint64_t,uint64_t,uint64_t,uint64_t,uint64_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMAX_PLUSCuint64_tMboolAuint64_tBuint64_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< uint64_t,bool,uint64_t,uint64_t,uint64_t,uint64_t,uint64_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMAX_PLUSCuint64_tMint32_tAint32_tBint32_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< uint64_t,int32_t,int32_t,int32_t,uint64_t,uint64_t,uint64_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMAX_PLUSCuint64_tMint32_tAint32_tBint32_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< uint64_t,int32_t,int32_t,int32_t,uint64_t,uint64_t,uint64_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMAX_PLUSCuint64_tMint32_tAint32_tBuint64_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< uint64_t,int32_t,int32_t,uint64_t,uint64_t,uint64_t,uint64_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMAX_PLUSCuint64_tMint32_tAint32_tBuint64_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< uint64_t,int32_t,int32_t,uint64_t,uint64_t,uint64_t,uint64_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMAX_PLUSCuint64_tMint32_tAuint64_tBint32_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< uint64_t,int32_t,uint64_t,int32_t,uint64_t,uint64_t,uint64_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMAX_PLUSCuint64_tMint32_tAuint64_tBint32_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< uint64_t,int32_t,uint64_t,int32_t,uint64_t,uint64_t,uint64_t > (5, 1024, 65536, 65536, SR);} +TEST( AxB_dot3_tests_warp, tinyxtinyMAX_PLUSCuint64_tMint32_tAuint64_tBuint64_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< uint64_t,int32_t,uint64_t,uint64_t,uint64_t,uint64_t,uint64_t > (5, 32, 256, 128, SR);} +TEST( AxB_dot3_tests_warp, smallxsmallMAX_PLUSCuint64_tMint32_tAuint64_tBuint64_tXuint64_tYuint64_tZuint64_t){ std::string SR = "MAX_PLUS"; test_AxB_dot3_warp_factory< uint64_t,int32_t,uint64_t,uint64_t,uint64_t,uint64_t,uint64_t > (5, 1024, 65536, 65536, SR);} diff --git a/GraphBLAS/CUDA/test/GpuTimer.h b/GraphBLAS/CUDA/test/GpuTimer.h new file mode 
100644 index 0000000000..63c3e1aaf4 --- /dev/null +++ b/GraphBLAS/CUDA/test/GpuTimer.h @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: Apache-2.0 +#ifndef __GPU_TIMER_H__ +#define __GPU_TIMER_H__ + +struct GpuTimer +{ + cudaEvent_t start; + cudaEvent_t stop; + + GpuTimer() + { + cudaEventCreate(&start); + cudaEventCreate(&stop); + } + + ~GpuTimer() + { + cudaEventDestroy(start); + cudaEventDestroy(stop); + } + + void Start() + { + cudaEventRecord(start, 0); + } + + void Stop() + { + cudaEventRecord(stop, 0); + } + + float Elapsed() + { + float elapsed; + cudaEventSynchronize(stop); + cudaEventElapsedTime(&elapsed, start, stop); + return elapsed; + } +}; + +#endif /* __GPU_TIMER_H__ */ diff --git a/GraphBLAS/CUDA/test/Makefile b/GraphBLAS/CUDA/test/Makefile new file mode 100644 index 0000000000..289e5da8c8 --- /dev/null +++ b/GraphBLAS/CUDA/test/Makefile @@ -0,0 +1,133 @@ +#------------------------------------------------------------------------------- +# GraphBLAS/CUDA/Makefile +#------------------------------------------------------------------------------- + +# cuda 10.1+ is assumed + +all: cudaTest + + +LIBS = -L/usr/local/cuda/lib64 -L/usr/local/cuda/lib64/stubs -lpthreads -lcudadevrt -lcudart -lnvrtc +INC += -I$(CUDA_DIR)/include -I../ -I../../Source -I../../Include -I../../Source/Template -I$(TEMPLATE_DIR) -Igoogletest/include + +CUDA_OPTS = -O2 --cudart=shared --gpu-architecture=compute_75\ + --relocatable-device-code true --device-c\ + --std=c++11 -Xcompiler -fPIC + + +%.o: %.cu + nvcc -c $(I) $(CUDA_OPTS) $(INC) -o $@ $< + +config: + nvidia-smi + nvcc --version + @echo " " + @echo "SO_NAME: " $(SO_NAME) + @echo "SO_OPTS: " $(SO_OPTS) + @echo "LIBS: " $(LIBS) + @echo "CUDA_OPTS: " $(CUDA_OPTS) + @echo "SRC: " $(SRC) + @echo "OBJ: " $(OBJ) + @echo "I: " $(I) + @echo " " + gcc --version + icc --version + +clean: + rm -f *.o + rm -f stringify + rm -f cudaTest + rm -f testJit +.PHONY: clean + +distclean: clean + rm -f *.so *.a + +purge: distclean + +################################################################################ + +GXX ?= g++ +GCC ?= gcc +DOXYGEN ?= doxygen +CXXFLAGS ?= -O3 -Wall -g -fmessage-length=80 +CFLAGS ?= -O2 -g -std=c11 + +CXX11 ?= 1 + +CUDA_DIR ?= /usr/local/cuda + +CXXFLAGS += -pthread + +ifeq ($(CXX11),1) + CXXFLAGS += -std=c++14 +endif + +EMBED_BEGIN = -rdynamic -Wl,-b,binary, +EMBED_END = ,-b,default + +UNAME_S := $(shell uname -s) +ifeq ($(UNAME_S),Linux) + CXXFLAGS += -D LINUX + CUDA_LIB_DIR = $(CUDA_DIR)/lib64 +else ifeq ($(UNAME_S),Darwin) + CUDA_LIB_DIR = $(CUDA_DIR)/lib +endif + +TEMPLATE_DIR ?= ../templates + +LIB += -ldl -L$(CUDA_LIB_DIR) -L$(CUDA_LIB_DIR)/stubs -lcuda -lcudadevrt -lcudart -lnvrtc + +HEADERS = jitify.hpp dataFactory.hpp jitFactory.hpp jitTestFactory.hpp semiringFactory.hpp \ + ../type_name.hpp + +TEMPLATES := $(wildcard $(TEMPLATE_DIR)/*.cu) + +CU_OBJS := ../GB_jit_cache.o ../GB_jit_launcher.o + +CFILES := $(wildcard ../*.c) + +COBJS := $(patsubst %.c, %.o, $(CFILES) ) + +JIT_TEMP := $(patsubst %.cu, %.cu.jit, $(TEMPLATES)) + +GTEST_LIB := googletest/build/lib/libgtest.a googletest/build/lib/libgtest_main.a + +%.cu.jit: %.cu + ../stringify $? 
> $@ + +stringify: stringify.cpp + $(GXX) -o $@ $< -O3 -Wall + +%.o: %.c + $(GXX) -c -o $@ $< $(CFLAGS) $(INC) + +%.o: %.cpp + $(GXX) -c -o $@ $< $(CXXFLAGS) $(INC) + +cu_link.o: $(CU_OBJS) + nvcc --gpu-architecture=compute_75 --device-link $(CU_OBJS) --output-file cu_link.o + + +testJit: testJit.cpp $(OBJS) $(HEADERS) $(JIT_TEMP) + $(GXX) -o $@ $< $(CXXFLAGS) $(INC) $(OBJS) $(LIB) + +AxB_dot3_test_instances.hpp: testGen.py + python3 testGen.py + + +instances := AxB_dot3_test_instances.hpp + + +cudaTest: cudaTest.cpp $(COBJS) $(OBJS) $(HEADERS) $(JIT_TEMP) cu_link.o AxB_dot3_cuda_tests.hpp $(instances) + $(GXX) -o $@ $< $(CXXFLAGS) $(INC) $(COBJS) $(CU_OBJS) cu_link.o $(LIB) $(GTEST_LIB) + +%.cu: %.cutmp + cp $? $@ + + +doc: jitify.hpp Doxyfile + $(DOXYGEN) Doxyfile +.PHONY: doc + + diff --git a/GraphBLAS/CUDA/test/cudaTest.cpp b/GraphBLAS/CUDA/test/cudaTest.cpp new file mode 100644 index 0000000000..e635ae39ab --- /dev/null +++ b/GraphBLAS/CUDA/test/cudaTest.cpp @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + Extended example for building on-the-fly kernels with C interface. + Simple examples demonstrating different ways to load source code + and call kernels. + */ + + +#include "AxB_dot3_cuda_tests.hpp" +#include "gtest/gtest.h" + + +//int main(int argc, char* argv[]) { +#if __cplusplus >= 201103L + +//#define TEST_RESULT(result) (result ? 
"PASSED" : "FAILED") +//std::cout << "Running tests..."<(256, 32,120,"PLUS_TIMES"); + EXPECT_EQ( true, test_spdot_plus_times_ffd_nu); +} + +TEST(MergePathDot, PlusTimesffdLarge) { + bool test_spdot_plus_times_ffd_lrg_nu = test_spdotfactoryUM(4096, 256,256,"PLUS_TIMES"); + EXPECT_EQ(true, test_spdot_plus_times_ffd_lrg_nu); +} + +TEST(MergePathDot, PlusTimesfff) { + bool test_spdot_plus_times_fff = test_spdotfactoryUM(256, 32,32,"PLUS_TIMES"); + EXPECT_EQ(true, test_spdot_plus_times_fff); +} + +TEST(MergePathDot, PlusTimeffdTiny) { + bool test_spdot_plus_times_ffd = test_spdotfactoryUM(256, 32,32,"PLUS_TIMES"); + EXPECT_EQ(true, test_spdot_plus_times_ffd); +} + +TEST(VSVSDot, PlusTimesfff) { + bool test_spdot_batch_fff = test_spdot_batch_factoryUM(5, 32, 128, 128, "PLUS_TIMES"); + EXPECT_EQ( true, test_spdot_batch_fff); +} + +TEST(VSVSDot, PlusTimesiii) { + bool test_spdot_batch_iii = test_spdot_batch_factoryUM(5, 32, 128, 128, "PLUS_TIMES"); + EXPECT_EQ( true, test_spdot_batch_iii); +} + + + +// bool test_spdot_batch_fff = test_spdot_batch_factoryUM(5, 32, 128, 128, "PLUS_TIMES"); + + cudaSetDevice(0); + cudaDeviceReset(); + bool test_spdot_batch_iii = test_spdot_batch_factoryUM(5, 32, 128, 128, "PLUS_TIMES"); + std::cout << "test_spdot_batchUM uncached: " + << TEST_RESULT(test_spdot_batch_iii) + << std::endl; + + cudaSetDevice(1); + cudaDeviceReset(); + + bool test_spdot_batch_iii2= test_spdot_batch_factoryUM(5, 32, 256, 128, "PLUS_TIMES"); + std::cout << "test_spdot_batchUM uncached: " + << TEST_RESULT(test_spdot_batch_iii2) + << std::endl; + + + + + + bool test_dot_min_plus_iil = test_dotfactoryUM(4096,"MIN_PLUS"); + std::cout << "test_dotfactoryUM uncached: " + << TEST_RESULT(test_dot_min_plus_iil) + << std::endl; + + bool test_dot_min_plus_ffd = test_dotfactoryUM(4096,"MIN_PLUS"); + std::cout << "test_dotfactoryUM uncached: " + << TEST_RESULT(test_dot_min_plus_ffd) + << std::endl; + + bool test_dot_plus_times_ffd = test_dotfactoryUM(4096,"PLUS_TIMES"); + std::cout << "test_dotfactoryUM uncached: " + << TEST_RESULT(test_dot_plus_times_ffd) + << std::endl; + + bool test_dot_plus_times_fii = test_dotfactoryUM(4096,"PLUS_TIMES"); + std::cout << "test_dotfactoryUM uncached: " + << TEST_RESULT(test_dot_plus_times_fii) + << std::endl; + + bool test_dot_plus_times_iil = test_dotfactoryUM(4096,"PLUS_TIMES"); + std::cout << "test_dotfactoryUM uncached: " + << TEST_RESULT(test_dot_plus_times_iil) + << std::endl; + + bool test_reducefactory_float_result = test_reducefactoryUM(4096, "PLUS"); + std::cout << "test_reducefactoryUM uncached: " + << TEST_RESULT(test_reducefactory_float_result) + << std::endl; + + bool test_reducefactory_double_plus_result = test_reducefactoryUM(4096, "PLUS"); + std::cout << "test_reducefactoryUM uncached: " + << TEST_RESULT(test_reducefactory_double_plus_result) + << std::endl; + + std::cout << "testing cached kernel" <(4096, "PLUS"); + std::cout << "test_reducefactoryUM cached: " + << TEST_RESULT(test2_reducefactory_double_plus_result) + << std::endl; + + bool test_reducefactory_float_min_result = test_reducefactoryUM(32,"MIN"); + std::cout << "test_reducefactoryUM MIN uncached: " + << TEST_RESULT(test_reducefactory_float_min_result) + << std::endl; + + bool test_reducefactory_int_min_result = test_reducefactoryUM(32,"MIN"); + std::cout << "test_reducefactoryUM MIN uncached: " + << TEST_RESULT(test_reducefactory_int_min_result) + << std::endl; + + bool test_reducefactory_int_max_result = test_reducefactoryUM(32,"MAX"); + std::cout << "test_reducefactoryUM MAX uncached: 
" + << TEST_RESULT(test_reducefactory_int_max_result) + << std::endl; + + bool test_reducefactory_int_result = test_reducefactoryUM(4096,"PLUS"); + std::cout << "test_reducefactoryUM PLUS uncached: " + << TEST_RESULT(test_reducefactory_int_result) + << std::endl; + + bool test_reducefactory_int_cache_result = + test_reducefactoryUM(4096,"PLUS"); + std::cout << "test_reducefactoryUM PLUS cached: " + << TEST_RESULT(test_reducefactory_int_cache_result) + << std::endl; +*/ +#endif diff --git a/GraphBLAS/CUDA/test/dataFactory.hpp b/GraphBLAS/CUDA/test/dataFactory.hpp new file mode 100644 index 0000000000..e2a345d43b --- /dev/null +++ b/GraphBLAS/CUDA/test/dataFactory.hpp @@ -0,0 +1,306 @@ +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include + +static const char *_cudaGetErrorEnum(cudaError_t error) { + return cudaGetErrorName(error); +} + +template +void check(T result, char const *const func, const char *const file, + int const line) { + if (result) { + fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line, + static_cast(result), _cudaGetErrorEnum(result), func); + exit(EXIT_FAILURE); + } +} + +#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__) + +// This will output the proper error string when calling cudaGetLastError +#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__) + +inline void __getLastCudaError(const char *errorMessage, const char *file, + const int line) { + cudaError_t err = cudaGetLastError(); + + if (cudaSuccess != err) { + fprintf(stderr, + "%s(%i) : getLastCudaError() CUDA error :" + " %s : (%d) %s.\n", + file, line, errorMessage, static_cast(err), + cudaGetErrorString(err)); + exit(EXIT_FAILURE); + } +} + +// This will only print the proper error string when calling cudaGetLastError +// but not exit program incase error detected. 
+#define printLastCudaError(msg) __printLastCudaError(msg, __FILE__, __LINE__) + +inline void __printLastCudaError(const char *errorMessage, const char *file, + const int line) { + cudaError_t err = cudaGetLastError(); + + if (cudaSuccess != err) { + fprintf(stderr, + "%s(%i) : getLastCudaError() CUDA error :" + " %s : (%d) %s.\n", + file, line, errorMessage, static_cast(err), + cudaGetErrorString(err)); + } +} +#define CHECK_CUDA(call) checkCudaErrors( call ) + +//Vector generators +template +void fillvector_linear( int N, T *vec) { + for (int i = 0; i< N; ++i) vec[i] = T(i); +} +template +void fillvector_constant( int N, T *vec, T val) { + for (int i = 0; i< N; ++i) vec[i] = val; +} + +// Mix-in class to enable unified memory +class Managed { +public: + void *operator new(size_t len) { + void *ptr = nullptr; + //std::cout<<"in new operator, alloc for "< +class matrix : public Managed { + public: + uint64_t zombie_count = 0; + int64_t vlen; + int64_t vdim; + int64_t nnz; + int64_t *p = nullptr; + int64_t *h = nullptr; + int64_t *i = nullptr; + T *x = nullptr; + bool is_filled = false; + + matrix(){}; + + matrix( int64_t N, int64_t nvecs){ + vlen = N; + vdim = nvecs; + } + + void set_zombie_count( uint64_t zc) { zombie_count = zc;} + uint64_t get_zombie_count() { return zombie_count;} + void add_zombie_count( int nz) { zombie_count += nz;} + + void clear() { + if ( p != nullptr){ cudaFree(p); p = nullptr; } + if ( h != nullptr){ cudaFree(h); h = nullptr; } + if ( i != nullptr){ cudaFree(i); i = nullptr; } + if ( x != nullptr){ cudaFree(x); x = nullptr; } + is_filled = false; + vlen = 0; + vdim = 0; + nnz = 0; + zombie_count = 0; + } + + void alloc( int64_t N, int64_t Nz) { + + //cudaMallocManaged((void**)&p, (Nz+N+1)*sizeof(int64_t)+ (Nz*sizeof(T))); + //i = p+(N+1); + //x = (T*)(p + (Nz+N+1)); + CHECK_CUDA( cudaMallocManaged((void**)&p, (N+1)*sizeof(int64_t)) ); + CHECK_CUDA( cudaMallocManaged((void**)&i, Nz*sizeof(int64_t)) ); + CHECK_CUDA( cudaMallocManaged((void**)&x, Nz*sizeof(T)) ); + + } + + void fill_random( int64_t N, int64_t Nz, std::mt19937 r) { + + int64_t inv_sparsity = (N*N)/Nz; //= values not taken per value occupied in index space + + //std::cout<< "fill_random N="<< N<<" need "<< Nz<<" values, invsparse = "< +class SpGEMM_problem_generator { + + float Anzpercent,Bnzpercent,Cnzpercent; + int64_t Cnz; + int64_t *Bucket = nullptr; + int64_t BucketStart[13]; + unsigned seed = 13372801; + std::mt19937 r; //random number generator Mersenne Twister + bool ready = false; + + public: + + matrix *C= nullptr; + matrix *M= nullptr; + matrix *A= nullptr; + matrix *B= nullptr; + + SpGEMM_problem_generator() { + + //std::cout<<"creating matrices"<; + // CHECK_CUDA( cudaMallocManaged( (void**)&C, sizeof(matrix)) ); + //cudaMemAdvise ( C, sizeof(matrix), cudaMemAdviseSetReadMostly, 1); + //std::cout<<"created C matrix"<; + //cudaMallocManaged( (void**)&M, sizeof(matrix)); + //cudaMemAdvise ( M, sizeof(matrix), cudaMemAdviseSetReadOnly, 1); + //std::cout<<"created M matrix"<; + //cudaMallocManaged( (void**)&A, sizeof(matrix)); + //cudaMemAdvise ( C, sizeof(matrix), cudaMemAdviseSetReadOnly, 1); + //std::cout<<"created A matrix"<; + //cudaMallocManaged( (void**)&B, sizeof(matrix)); + //cudaMemAdvise ( C, sizeof(matrix), cudaMemAdviseSetReadOnly, 1); + //std::cout<<"created B matrix"<* getCptr(){ return C;} + matrix* getMptr(){ return M;} + matrix* getAptr(){ return A;} + matrix* getBptr(){ return B;} + + int64_t* getBucket() { return Bucket;} + int64_t* getBucketStart(){ return BucketStart;} 
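// Note on the members that follow: loadCj() stores each entry's column index in
// C->i shifted left by 4 bits, which keeps the low 4 bits of C->i free to carry a
// bucket id for the 12 dot3 cases the tests exercise (BucketStart has 13 entries,
// consistent with start offsets for 12 buckets), and fill_buckets() appears to
// assign every entry of C to the single bucket selected by its fill_bucket
// argument so that one kernel variant can be exercised in isolation.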
+ + void loadCj() { + + // Load C_i with column j info to avoid another lookup + for (int c = 0 ; c< M->vdim; ++c) { + for ( int r = M->p[c]; r< M->p[c+1]; ++r){ + C->i[r] = c << 4 ; //shift to store bucket info + } + } + + } + + void init( int64_t N , int64_t Anz, int64_t Bnz, float Cnzpercent){ + + // Get sizes relative to fully dense matrices + Anzpercent = float(Anz)/float(N*N); + Bnzpercent = float(Bnz)/float(N*N); + Cnzpercent = Cnzpercent; + Cnz = (int64_t)(Cnzpercent * N * N); + std::cout<<"Anz% ="<fill_random( N, Cnz, r); + M->fill_random( N, Cnz, r); + A->fill_random( N, Anz, r); + B->fill_random( N, Bnz, r); + + std::cout<<"fill complete"<p = M->p; //same column pointers (assuming CSC here) + + loadCj(); + + } + + void del(){ + C->clear(); + M->clear(); + A->clear(); + B->clear(); + if (Bucket != nullptr) CHECK_CUDA( cudaFree(Bucket) ); + delete C; + delete M; + delete A; + delete B; + CHECK_CUDA( cudaDeviceSynchronize() ); + } + + void fill_buckets( int fill_bucket){ + + std::cout< fill_bucket) BucketStart[b] = Cnz; + //std::cout<< " one bucket "<< b<<"starts at "<" + "$/${CMAKE_INSTALL_INCLUDEDIR}>") + target_include_directories(gmock_main SYSTEM INTERFACE + "$" + "$/${CMAKE_INSTALL_INCLUDEDIR}>") +endif() + +######################################################################## +# +# Install rules +install_project(gmock gmock_main) + +######################################################################## +# +# Google Mock's own tests. +# +# You can skip this section if you aren't interested in testing +# Google Mock itself. +# +# The tests are not built by default. To build them, set the +# gmock_build_tests option to ON. You can do it by running ccmake +# or specifying the -Dgmock_build_tests=ON flag when running cmake. + +if (gmock_build_tests) + # This must be set in the root directory for the tests to be run by + # 'make test' or ctest. + enable_testing() + + if (WIN32) + file(GENERATE OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/$/RunTest.ps1" + CONTENT +"$project_bin = \"${CMAKE_BINARY_DIR}/bin/$\" +$env:Path = \"$project_bin;$env:Path\" +& $args") + elseif (MINGW OR CYGWIN) + file(GENERATE OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/RunTest.ps1" + CONTENT +"$project_bin = (cygpath --windows ${CMAKE_BINARY_DIR}/bin) +$env:Path = \"$project_bin;$env:Path\" +& $args") + endif() + + if (MINGW OR CYGWIN) + if (CMAKE_VERSION VERSION_LESS "2.8.12") + add_compile_options("-Wa,-mbig-obj") + else() + add_definitions("-Wa,-mbig-obj") + endif() + endif() + + ############################################################ + # C++ tests built with standard compiler flags. + + cxx_test(gmock-actions_test gmock_main) + cxx_test(gmock-cardinalities_test gmock_main) + cxx_test(gmock_ex_test gmock_main) + cxx_test(gmock-function-mocker_test gmock_main) + cxx_test(gmock-generated-actions_test gmock_main) + cxx_test(gmock-generated-matchers_test gmock_main) + cxx_test(gmock-internal-utils_test gmock_main) + cxx_test(gmock-matchers_test gmock_main) + cxx_test(gmock-more-actions_test gmock_main) + cxx_test(gmock-nice-strict_test gmock_main) + cxx_test(gmock-port_test gmock_main) + cxx_test(gmock-spec-builders_test gmock_main) + cxx_test(gmock_link_test gmock_main test/gmock_link2_test.cc) + cxx_test(gmock_test gmock_main) + + if (DEFINED GTEST_HAS_PTHREAD) + cxx_test(gmock_stress_test gmock) + endif() + + # gmock_all_test is commented to save time building and running tests. + # Uncomment if necessary. 
+ # cxx_test(gmock_all_test gmock_main) + + ############################################################ + # C++ tests built with non-standard compiler flags. + + if (MSVC) + cxx_library(gmock_main_no_exception "${cxx_no_exception}" + "${gtest_dir}/src/gtest-all.cc" src/gmock-all.cc src/gmock_main.cc) + + cxx_library(gmock_main_no_rtti "${cxx_no_rtti}" + "${gtest_dir}/src/gtest-all.cc" src/gmock-all.cc src/gmock_main.cc) + + else() + cxx_library(gmock_main_no_exception "${cxx_no_exception}" src/gmock_main.cc) + target_link_libraries(gmock_main_no_exception PUBLIC gmock) + + cxx_library(gmock_main_no_rtti "${cxx_no_rtti}" src/gmock_main.cc) + target_link_libraries(gmock_main_no_rtti PUBLIC gmock) + endif() + cxx_test_with_flags(gmock-more-actions_no_exception_test "${cxx_no_exception}" + gmock_main_no_exception test/gmock-more-actions_test.cc) + + cxx_test_with_flags(gmock_no_rtti_test "${cxx_no_rtti}" + gmock_main_no_rtti test/gmock-spec-builders_test.cc) + + cxx_shared_library(shared_gmock_main "${cxx_default}" + "${gtest_dir}/src/gtest-all.cc" src/gmock-all.cc src/gmock_main.cc) + + # Tests that a binary can be built with Google Mock as a shared library. On + # some system configurations, it may not possible to run the binary without + # knowing more details about the system configurations. We do not try to run + # this binary. To get a more robust shared library coverage, configure with + # -DBUILD_SHARED_LIBS=ON. + cxx_executable_with_flags(shared_gmock_test_ "${cxx_default}" + shared_gmock_main test/gmock-spec-builders_test.cc) + set_target_properties(shared_gmock_test_ + PROPERTIES + COMPILE_DEFINITIONS "GTEST_LINKED_AS_SHARED_LIBRARY=1") + + ############################################################ + # Python tests. + + cxx_executable(gmock_leak_test_ test gmock_main) + py_test(gmock_leak_test) + + cxx_executable(gmock_output_test_ test gmock) + py_test(gmock_output_test) +endif() diff --git a/GraphBLAS/CUDA/test/googlemock/CONTRIBUTORS b/GraphBLAS/CUDA/test/googlemock/CONTRIBUTORS new file mode 100644 index 0000000000..6e9ae362b6 --- /dev/null +++ b/GraphBLAS/CUDA/test/googlemock/CONTRIBUTORS @@ -0,0 +1,40 @@ +# This file contains a list of people who've made non-trivial +# contribution to the Google C++ Mocking Framework project. People +# who commit code to the project are encouraged to add their names +# here. Please keep the list sorted by first names. + +Benoit Sigoure +Bogdan Piloca +Chandler Carruth +Dave MacLachlan +David Anderson +Dean Sturtevant +Gene Volovich +Hal Burch +Jeffrey Yasskin +Jim Keller +Joe Walnes +Jon Wray +Keir Mierle +Keith Ray +Kostya Serebryany +Lev Makhlis +Manuel Klimek +Mario Tanev +Mark Paskin +Markus Heule +Matthew Simmons +Mike Bland +Neal Norwitz +Nermin Ozkiranartli +Owen Carlsen +Paneendra Ba +Paul Menage +Piotr Kaminski +Russ Rufer +Sverre Sundsdal +Takeshi Yoshino +Vadim Berman +Vlad Losev +Wolfgang Klier +Zhanyong Wan diff --git a/GraphBLAS/CUDA/test/googlemock/LICENSE b/GraphBLAS/CUDA/test/googlemock/LICENSE new file mode 100644 index 0000000000..1941a11f8c --- /dev/null +++ b/GraphBLAS/CUDA/test/googlemock/LICENSE @@ -0,0 +1,28 @@ +Copyright 2008, Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/GraphBLAS/CUDA/test/googlemock/README.md b/GraphBLAS/CUDA/test/googlemock/README.md new file mode 100644 index 0000000000..183fdb81d9 --- /dev/null +++ b/GraphBLAS/CUDA/test/googlemock/README.md @@ -0,0 +1,44 @@ +# Googletest Mocking (gMock) Framework + +### Overview + +Google's framework for writing and using C++ mock classes. It can help you +derive better designs of your system and write better tests. + +It is inspired by: + +* [jMock](http://www.jmock.org/), +* [EasyMock](http://www.easymock.org/), and +* [Hamcrest](http://code.google.com/p/hamcrest/), + +and designed with C++'s specifics in mind. + +gMock: + +- provides a declarative syntax for defining mocks, +- can define partial (hybrid) mocks, which are a cross of real and mock + objects, +- handles functions of arbitrary types and overloaded functions, +- comes with a rich set of matchers for validating function arguments, +- uses an intuitive syntax for controlling the behavior of a mock, +- does automatic verification of expectations (no record-and-replay needed), +- allows arbitrary (partial) ordering constraints on function calls to be + expressed, +- lets a user extend it by defining new matchers and actions. +- does not use exceptions, and +- is easy to learn and use. + +Details and examples can be found here: + +* [gMock for Dummies](docs/for_dummies.md) +* [Legacy gMock FAQ](docs/gmock_faq.md) +* [gMock Cookbook](docs/cook_book.md) +* [gMock Cheat Sheet](docs/cheat_sheet.md) + +Please note that code under scripts/generator/ is from the [cppclean +project](http://code.google.com/p/cppclean/) and under the Apache +License, which is different from Google Mock's license. + +Google Mock is a part of +[Google Test C++ testing framework](http://github.com/google/googletest/) and a +subject to the same requirements. 
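+
+As a minimal taste of the declarative syntax listed above (a sketch only;
+`Greeter`, `MockGreeter`, and `GreeterTest` are illustrative names, not part
+of gMock):
+
+```cpp
+#include <string>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+// A hypothetical interface, used only for this illustration.
+class Greeter {
+ public:
+  virtual ~Greeter() = default;
+  virtual std::string Greet(const std::string& name) = 0;
+};
+
+class MockGreeter : public Greeter {
+ public:
+  MOCK_METHOD(std::string, Greet, (const std::string& name), (override));
+};
+
+TEST(GreeterTest, GreetsByName) {
+  MockGreeter mock;
+  EXPECT_CALL(mock, Greet("World"))
+      .WillOnce(::testing::Return("Hello, World"));
+  EXPECT_EQ("Hello, World", mock.Greet("World"));
+}
+```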
diff --git a/GraphBLAS/CUDA/test/googlemock/cmake/gmock.pc.in b/GraphBLAS/CUDA/test/googlemock/cmake/gmock.pc.in new file mode 100644 index 0000000000..5780fcaa53 --- /dev/null +++ b/GraphBLAS/CUDA/test/googlemock/cmake/gmock.pc.in @@ -0,0 +1,10 @@ +libdir=@CMAKE_INSTALL_FULL_LIBDIR@ +includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ + +Name: gmock +Description: GoogleMock (without main() function) +Version: @PROJECT_VERSION@ +URL: https://github.com/google/googletest +Requires: gtest +Libs: -L${libdir} -lgmock @CMAKE_THREAD_LIBS_INIT@ +Cflags: -I${includedir} @GTEST_HAS_PTHREAD_MACRO@ diff --git a/GraphBLAS/CUDA/test/googlemock/cmake/gmock_main.pc.in b/GraphBLAS/CUDA/test/googlemock/cmake/gmock_main.pc.in new file mode 100644 index 0000000000..f2dfe69e0f --- /dev/null +++ b/GraphBLAS/CUDA/test/googlemock/cmake/gmock_main.pc.in @@ -0,0 +1,10 @@ +libdir=@CMAKE_INSTALL_FULL_LIBDIR@ +includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ + +Name: gmock_main +Description: GoogleMock (with main() function) +Version: @PROJECT_VERSION@ +URL: https://github.com/google/googletest +Requires: gmock +Libs: -L${libdir} -lgmock_main @CMAKE_THREAD_LIBS_INIT@ +Cflags: -I${includedir} @GTEST_HAS_PTHREAD_MACRO@ diff --git a/GraphBLAS/CUDA/test/googlemock/docs/cheat_sheet.md b/GraphBLAS/CUDA/test/googlemock/docs/cheat_sheet.md new file mode 100644 index 0000000000..1e0541ba81 --- /dev/null +++ b/GraphBLAS/CUDA/test/googlemock/docs/cheat_sheet.md @@ -0,0 +1,776 @@ +## gMock Cheat Sheet + + + + + +### Defining a Mock Class + +#### Mocking a Normal Class {#MockClass} + +Given + +```cpp +class Foo { + ... + virtual ~Foo(); + virtual int GetSize() const = 0; + virtual string Describe(const char* name) = 0; + virtual string Describe(int type) = 0; + virtual bool Process(Bar elem, int count) = 0; +}; +``` + +(note that `~Foo()` **must** be virtual) we can define its mock as + +```cpp +#include "gmock/gmock.h" + +class MockFoo : public Foo { + ... + MOCK_METHOD(int, GetSize, (), (const, override)); + MOCK_METHOD(string, Describe, (const char* name), (override)); + MOCK_METHOD(string, Describe, (int type), (override)); + MOCK_METHOD(bool, Process, (Bar elem, int count), (override)); +}; +``` + +To create a "nice" mock, which ignores all uninteresting calls, a "naggy" mock, +which warns on all uninteresting calls, or a "strict" mock, which treats them as +failures: + +```cpp +using ::testing::NiceMock; +using ::testing::NaggyMock; +using ::testing::StrictMock; + +NiceMock nice_foo; // The type is a subclass of MockFoo. +NaggyMock naggy_foo; // The type is a subclass of MockFoo. +StrictMock strict_foo; // The type is a subclass of MockFoo. +``` + +**Note:** A mock object is currently naggy by default. We may make it nice by +default in the future. + +#### Mocking a Class Template {#MockTemplate} + +Class templates can be mocked just like any class. + +To mock + +```cpp +template +class StackInterface { + ... + virtual ~StackInterface(); + virtual int GetSize() const = 0; + virtual void Push(const Elem& x) = 0; +}; +``` + +(note that all member functions that are mocked, including `~StackInterface()` +**must** be virtual). + +```cpp +template +class MockStack : public StackInterface { + ... + MOCK_METHOD(int, GetSize, (), (const, override)); + MOCK_METHOD(void, Push, (const Elem& x), (override)); +}; +``` + +#### Specifying Calling Conventions for Mock Functions + +If your mock function doesn't use the default calling convention, you can +specify it by adding `Calltype(convention)` to `MOCK_METHOD`'s 4th parameter. 
+For example, + +```cpp + MOCK_METHOD(bool, Foo, (int n), (Calltype(STDMETHODCALLTYPE))); + MOCK_METHOD(int, Bar, (double x, double y), + (const, Calltype(STDMETHODCALLTYPE))); +``` + +where `STDMETHODCALLTYPE` is defined by `` on Windows. + +### Using Mocks in Tests {#UsingMocks} + +The typical work flow is: + +1. Import the gMock names you need to use. All gMock symbols are in the + `testing` namespace unless they are macros or otherwise noted. +2. Create the mock objects. +3. Optionally, set the default actions of the mock objects. +4. Set your expectations on the mock objects (How will they be called? What + will they do?). +5. Exercise code that uses the mock objects; if necessary, check the result + using googletest assertions. +6. When a mock object is destructed, gMock automatically verifies that all + expectations on it have been satisfied. + +Here's an example: + +```cpp +using ::testing::Return; // #1 + +TEST(BarTest, DoesThis) { + MockFoo foo; // #2 + + ON_CALL(foo, GetSize()) // #3 + .WillByDefault(Return(1)); + // ... other default actions ... + + EXPECT_CALL(foo, Describe(5)) // #4 + .Times(3) + .WillRepeatedly(Return("Category 5")); + // ... other expectations ... + + EXPECT_EQ("good", MyProductionFunction(&foo)); // #5 +} // #6 +``` + +### Setting Default Actions {#OnCall} + +gMock has a **built-in default action** for any function that returns `void`, +`bool`, a numeric value, or a pointer. In C++11, it will additionally returns +the default-constructed value, if one exists for the given type. + +To customize the default action for functions with return type *`T`*: + +```cpp +using ::testing::DefaultValue; + +// Sets the default value to be returned. T must be CopyConstructible. +DefaultValue::Set(value); +// Sets a factory. Will be invoked on demand. T must be MoveConstructible. +// T MakeT(); +DefaultValue::SetFactory(&MakeT); +// ... use the mocks ... +// Resets the default value. +DefaultValue::Clear(); +``` + +Example usage: + +```cpp + // Sets the default action for return type std::unique_ptr to + // creating a new Buzz every time. + DefaultValue>::SetFactory( + [] { return MakeUnique(AccessLevel::kInternal); }); + + // When this fires, the default action of MakeBuzz() will run, which + // will return a new Buzz object. + EXPECT_CALL(mock_buzzer_, MakeBuzz("hello")).Times(AnyNumber()); + + auto buzz1 = mock_buzzer_.MakeBuzz("hello"); + auto buzz2 = mock_buzzer_.MakeBuzz("hello"); + EXPECT_NE(nullptr, buzz1); + EXPECT_NE(nullptr, buzz2); + EXPECT_NE(buzz1, buzz2); + + // Resets the default action for return type std::unique_ptr, + // to avoid interfere with other tests. + DefaultValue>::Clear(); +``` + +To customize the default action for a particular method of a specific mock +object, use `ON_CALL()`. `ON_CALL()` has a similar syntax to `EXPECT_CALL()`, +but it is used for setting default behaviors (when you do not require that the +mock method is called). See [here](cook_book.md#UseOnCall) for a more detailed +discussion. + +```cpp +ON_CALL(mock-object, method(matchers)) + .With(multi-argument-matcher) ? + .WillByDefault(action); +``` + +### Setting Expectations {#ExpectCall} + +`EXPECT_CALL()` sets **expectations** on a mock method (How will it be called? +What will it do?): + +```cpp +EXPECT_CALL(mock-object, method (matchers)?) + .With(multi-argument-matcher) ? + .Times(cardinality) ? + .InSequence(sequences) * + .After(expectations) * + .WillOnce(action) * + .WillRepeatedly(action) ? + .RetiresOnSaturation(); ? 
+``` + +For each item above, `?` means it can be used at most once, while `*` means it +can be used any number of times. + +In order to pass, `EXPECT_CALL` must be used before the calls are actually made. + +The `(matchers)` is a comma-separated list of matchers that correspond to each +of the arguments of `method`, and sets the expectation only for calls of +`method` that matches all of the matchers. + +If `(matchers)` is omitted, the expectation is the same as if the matchers were +set to anything matchers (for example, `(_, _, _, _)` for a four-arg method). + +If `Times()` is omitted, the cardinality is assumed to be: + +* `Times(1)` when there is neither `WillOnce()` nor `WillRepeatedly()`; +* `Times(n)` when there are `n` `WillOnce()`s but no `WillRepeatedly()`, where + `n` >= 1; or +* `Times(AtLeast(n))` when there are `n` `WillOnce()`s and a + `WillRepeatedly()`, where `n` >= 0. + +A method with no `EXPECT_CALL()` is free to be invoked *any number of times*, +and the default action will be taken each time. + +### Matchers {#MatcherList} + + + +A **matcher** matches a *single* argument. You can use it inside `ON_CALL()` or +`EXPECT_CALL()`, or use it to validate a value directly using two macros: + + +| Macro | Description | +| :----------------------------------- | :------------------------------------ | +| `EXPECT_THAT(actual_value, matcher)` | Asserts that `actual_value` matches `matcher`. | +| `ASSERT_THAT(actual_value, matcher)` | The same as `EXPECT_THAT(actual_value, matcher)`, except that it generates a **fatal** failure. | + + +Built-in matchers (where `argument` is the function argument, e.g. +`actual_value` in the example above, or when used in the context of +`EXPECT_CALL(mock_object, method(matchers))`, the arguments of `method`) are +divided into several categories: + +#### Wildcard + +Matcher | Description +:-------------------------- | :----------------------------------------------- +`_` | `argument` can be any value of the correct type. +`A()` or `An()` | `argument` can be any value of type `type`. + +#### Generic Comparison + + +| Matcher | Description | +| :--------------------- | :-------------------------------------------------- | +| `Eq(value)` or `value` | `argument == value` | +| `Ge(value)` | `argument >= value` | +| `Gt(value)` | `argument > value` | +| `Le(value)` | `argument <= value` | +| `Lt(value)` | `argument < value` | +| `Ne(value)` | `argument != value` | +| `IsFalse()` | `argument` evaluates to `false` in a Boolean context. | +| `IsTrue()` | `argument` evaluates to `true` in a Boolean context. | +| `IsNull()` | `argument` is a `NULL` pointer (raw or smart). | +| `NotNull()` | `argument` is a non-null pointer (raw or smart). | +| `Optional(m)` | `argument` is `optional<>` that contains a value matching `m`. (For testing whether an `optional<>` is set, check for equality with `nullopt`. You may need to use `Eq(nullopt)` if the inner type doesn't have `==`.)| +| `VariantWith(m)` | `argument` is `variant<>` that holds the alternative of type T with a value matching `m`. | +| `Ref(variable)` | `argument` is a reference to `variable`. | +| `TypedEq(value)` | `argument` has type `type` and is equal to `value`. You may need to use this instead of `Eq(value)` when the mock function is overloaded. | + + +Except `Ref()`, these matchers make a *copy* of `value` in case it's modified or +destructed later. If the compiler complains that `value` doesn't have a public +copy constructor, try wrap it in `ByRef()`, e.g. +`Eq(ByRef(non_copyable_value))`. 
If you do that, make sure `non_copyable_value` +is not changed afterwards, or the meaning of your matcher will be changed. + +`IsTrue` and `IsFalse` are useful when you need to use a matcher, or for types +that can be explicitly converted to Boolean, but are not implicitly converted to +Boolean. In other cases, you can use the basic +[`EXPECT_TRUE` and `EXPECT_FALSE`](../../googletest/docs/primer#basic-assertions) +assertions. + +#### Floating-Point Matchers {#FpMatchers} + + +| Matcher | Description | +| :------------------------------- | :--------------------------------- | +| `DoubleEq(a_double)` | `argument` is a `double` value approximately equal to `a_double`, treating two NaNs as unequal. | +| `FloatEq(a_float)` | `argument` is a `float` value approximately equal to `a_float`, treating two NaNs as unequal. | +| `NanSensitiveDoubleEq(a_double)` | `argument` is a `double` value approximately equal to `a_double`, treating two NaNs as equal. | +| `NanSensitiveFloatEq(a_float)` | `argument` is a `float` value approximately equal to `a_float`, treating two NaNs as equal. | +| `IsNan()` | `argument` is any floating-point type with a NaN value. | + + +The above matchers use ULP-based comparison (the same as used in googletest). +They automatically pick a reasonable error bound based on the absolute value of +the expected value. `DoubleEq()` and `FloatEq()` conform to the IEEE standard, +which requires comparing two NaNs for equality to return false. The +`NanSensitive*` version instead treats two NaNs as equal, which is often what a +user wants. + + +| Matcher | Description | +| :------------------------------------------------ | :----------------------- | +| `DoubleNear(a_double, max_abs_error)` | `argument` is a `double` value close to `a_double` (absolute error <= `max_abs_error`), treating two NaNs as unequal. | +| `FloatNear(a_float, max_abs_error)` | `argument` is a `float` value close to `a_float` (absolute error <= `max_abs_error`), treating two NaNs as unequal. | +| `NanSensitiveDoubleNear(a_double, max_abs_error)` | `argument` is a `double` value close to `a_double` (absolute error <= `max_abs_error`), treating two NaNs as equal. | +| `NanSensitiveFloatNear(a_float, max_abs_error)` | `argument` is a `float` value close to `a_float` (absolute error <= `max_abs_error`), treating two NaNs as equal. | + + +#### String Matchers + +The `argument` can be either a C string or a C++ string object: + + +| Matcher | Description | +| :---------------------- | :------------------------------------------------- | +| `ContainsRegex(string)` | `argument` matches the given regular expression. | +| `EndsWith(suffix)` | `argument` ends with string `suffix`. | +| `HasSubstr(string)` | `argument` contains `string` as a sub-string. | +| `MatchesRegex(string)` | `argument` matches the given regular expression with the match starting at the first character and ending at the last character. | +| `StartsWith(prefix)` | `argument` starts with string `prefix`. | +| `StrCaseEq(string)` | `argument` is equal to `string`, ignoring case. | +| `StrCaseNe(string)` | `argument` is not equal to `string`, ignoring case. | +| `StrEq(string)` | `argument` is equal to `string`. | +| `StrNe(string)` | `argument` is not equal to `string`. | + + +`ContainsRegex()` and `MatchesRegex()` take ownership of the `RE` object. They +use the regular expression syntax defined +[here](../../googletest/docs/advanced.md#regular-expression-syntax). 
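+
+For example, the string matchers can be used directly with `EXPECT_THAT`
+(`GetErrorMessage()` is a hypothetical function, used here only for
+illustration):
+
+```cpp
+using ::testing::HasSubstr;
+using ::testing::StartsWith;
+
+// Passes if the message starts with "ERROR" and mentions the config file.
+EXPECT_THAT(GetErrorMessage(), StartsWith("ERROR"));
+EXPECT_THAT(GetErrorMessage(), HasSubstr("config.yaml"));
+```
+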
All of +these matchers, except `ContainsRegex()` and `MatchesRegex()` work for wide +strings as well. + +#### Container Matchers + +Most STL-style containers support `==`, so you can use `Eq(expected_container)` +or simply `expected_container` to match a container exactly. If you want to +write the elements in-line, match them more flexibly, or get more informative +messages, you can use: + + +| Matcher | Description | +| :---------------------------------------- | :------------------------------- | +| `BeginEndDistanceIs(m)` | `argument` is a container whose `begin()` and `end()` iterators are separated by a number of increments matching `m`. E.g. `BeginEndDistanceIs(2)` or `BeginEndDistanceIs(Lt(2))`. For containers that define a `size()` method, `SizeIs(m)` may be more efficient. | +| `ContainerEq(container)` | The same as `Eq(container)` except that the failure message also includes which elements are in one container but not the other. | +| `Contains(e)` | `argument` contains an element that matches `e`, which can be either a value or a matcher. | +| `Each(e)` | `argument` is a container where *every* element matches `e`, which can be either a value or a matcher. | +| `ElementsAre(e0, e1, ..., en)` | `argument` has `n + 1` elements, where the *i*-th element matches `ei`, which can be a value or a matcher. | +| `ElementsAreArray({e0, e1, ..., en})`, `ElementsAreArray(a_container)`, `ElementsAreArray(begin, end)`, `ElementsAreArray(array)`, or `ElementsAreArray(array, count)` | The same as `ElementsAre()` except that the expected element values/matchers come from an initializer list, STL-style container, iterator range, or C-style array. | +| `IsEmpty()` | `argument` is an empty container (`container.empty()`). | +| `IsSubsetOf({e0, e1, ..., en})`, `IsSubsetOf(a_container)`, `IsSubsetOf(begin, end)`, `IsSubsetOf(array)`, or `IsSubsetOf(array, count)` | `argument` matches `UnorderedElementsAre(x0, x1, ..., xk)` for some subset `{x0, x1, ..., xk}` of the expected matchers. | +| `IsSupersetOf({e0, e1, ..., en})`, `IsSupersetOf(a_container)`, `IsSupersetOf(begin, end)`, `IsSupersetOf(array)`, or `IsSupersetOf(array, count)` | Some subset of `argument` matches `UnorderedElementsAre(`expected matchers`)`. | +| `Pointwise(m, container)`, `Pointwise(m, {e0, e1, ..., en})` | `argument` contains the same number of elements as in `container`, and for all i, (the i-th element in `argument`, the i-th element in `container`) match `m`, which is a matcher on 2-tuples. E.g. `Pointwise(Le(), upper_bounds)` verifies that each element in `argument` doesn't exceed the corresponding element in `upper_bounds`. See more detail below. | +| `SizeIs(m)` | `argument` is a container whose size matches `m`. E.g. `SizeIs(2)` or `SizeIs(Lt(2))`. | +| `UnorderedElementsAre(e0, e1, ..., en)` | `argument` has `n + 1` elements, and under *some* permutation of the elements, each element matches an `ei` (for a different `i`), which can be a value or a matcher. | +| `UnorderedElementsAreArray({e0, e1, ..., en})`, `UnorderedElementsAreArray(a_container)`, `UnorderedElementsAreArray(begin, end)`, `UnorderedElementsAreArray(array)`, or `UnorderedElementsAreArray(array, count)` | The same as `UnorderedElementsAre()` except that the expected element values/matchers come from an initializer list, STL-style container, iterator range, or C-style array. | +| `UnorderedPointwise(m, container)`, `UnorderedPointwise(m, {e0, e1, ..., en})` | Like `Pointwise(m, container)`, but ignores the order of elements. 
| +| `WhenSorted(m)` | When `argument` is sorted using the `<` operator, it matches container matcher `m`. E.g. `WhenSorted(ElementsAre(1, 2, 3))` verifies that `argument` contains elements 1, 2, and 3, ignoring order. | +| `WhenSortedBy(comparator, m)` | The same as `WhenSorted(m)`, except that the given comparator instead of `<` is used to sort `argument`. E.g. `WhenSortedBy(std::greater(), ElementsAre(3, 2, 1))`. | + + +**Notes:** + +* These matchers can also match: + 1. a native array passed by reference (e.g. in `Foo(const int (&a)[5])`), + and + 2. an array passed as a pointer and a count (e.g. in `Bar(const T* buffer, + int len)` -- see [Multi-argument Matchers](#MultiArgMatchers)). +* The array being matched may be multi-dimensional (i.e. its elements can be + arrays). +* `m` in `Pointwise(m, ...)` should be a matcher for `::std::tuple` + where `T` and `U` are the element type of the actual container and the + expected container, respectively. For example, to compare two `Foo` + containers where `Foo` doesn't support `operator==`, one might write: + + ```cpp + using ::std::get; + MATCHER(FooEq, "") { + return std::get<0>(arg).Equals(std::get<1>(arg)); + } + ... + EXPECT_THAT(actual_foos, Pointwise(FooEq(), expected_foos)); + ``` + +#### Member Matchers + + +| Matcher | Description | +| :------------------------------ | :----------------------------------------- | +| `Field(&class::field, m)` | `argument.field` (or `argument->field` when `argument` is a plain pointer) matches matcher `m`, where `argument` is an object of type _class_. | +| `Key(e)` | `argument.first` matches `e`, which can be either a value or a matcher. E.g. `Contains(Key(Le(5)))` can verify that a `map` contains a key `<= 5`. | +| `Pair(m1, m2)` | `argument` is an `std::pair` whose `first` field matches `m1` and `second` field matches `m2`. | +| `Property(&class::property, m)` | `argument.property()` (or `argument->property()` when `argument` is a plain pointer) matches matcher `m`, where `argument` is an object of type _class_. | + + +#### Matching the Result of a Function, Functor, or Callback + + +| Matcher | Description | +| :--------------- | :------------------------------------------------ | +| `ResultOf(f, m)` | `f(argument)` matches matcher `m`, where `f` is a function or functor. | + + +#### Pointer Matchers + + +| Matcher | Description | +| :------------------------ | :---------------------------------------------- | +| `Pointee(m)` | `argument` (either a smart pointer or a raw pointer) points to a value that matches matcher `m`. | +| `WhenDynamicCastTo(m)` | when `argument` is passed through `dynamic_cast()`, it matches matcher `m`. | + + + + + + +#### Multi-argument Matchers {#MultiArgMatchers} + +Technically, all matchers match a *single* value. A "multi-argument" matcher is +just one that matches a *tuple*. The following matchers can be used to match a +tuple `(x, y)`: + +Matcher | Description +:------ | :---------- +`Eq()` | `x == y` +`Ge()` | `x >= y` +`Gt()` | `x > y` +`Le()` | `x <= y` +`Lt()` | `x < y` +`Ne()` | `x != y` + +You can use the following selectors to pick a subset of the arguments (or +reorder them) to participate in the matching: + + +| Matcher | Description | +| :------------------------- | :---------------------------------------------- | +| `AllArgs(m)` | Equivalent to `m`. Useful as syntactic sugar in `.With(AllArgs(m))`. | +| `Args(m)` | The tuple of the `k` selected (using 0-based indices) arguments matches `m`, e.g. `Args<1, 2>(Eq())`. 
| + + +#### Composite Matchers + +You can make a matcher from one or more other matchers: + + +| Matcher | Description | +| :------------------------------- | :-------------------------------------- | +| `AllOf(m1, m2, ..., mn)` | `argument` matches all of the matchers `m1` to `mn`. | +| `AllOfArray({m0, m1, ..., mn})`, `AllOfArray(a_container)`, `AllOfArray(begin, end)`, `AllOfArray(array)`, or `AllOfArray(array, count)` | The same as `AllOf()` except that the matchers come from an initializer list, STL-style container, iterator range, or C-style array. | +| `AnyOf(m1, m2, ..., mn)` | `argument` matches at least one of the matchers `m1` to `mn`. | +| `AnyOfArray({m0, m1, ..., mn})`, `AnyOfArray(a_container)`, `AnyOfArray(begin, end)`, `AnyOfArray(array)`, or `AnyOfArray(array, count)` | The same as `AnyOf()` except that the matchers come from an initializer list, STL-style container, iterator range, or C-style array. | +| `Not(m)` | `argument` doesn't match matcher `m`. | + + + + +#### Adapters for Matchers + + +| Matcher | Description | +| :---------------------- | :------------------------------------ | +| `MatcherCast(m)` | casts matcher `m` to type `Matcher`. | +| `SafeMatcherCast(m)` | [safely casts](cook_book.md#casting-matchers) matcher `m` to type `Matcher`. | +| `Truly(predicate)` | `predicate(argument)` returns something considered by C++ to be true, where `predicate` is a function or functor. | + + +`AddressSatisfies(callback)` and `Truly(callback)` take ownership of `callback`, +which must be a permanent callback. + +#### Using Matchers as Predicates {#MatchersAsPredicatesCheat} + + +| Matcher | Description | +| :---------------------------- | :------------------------------------------ | +| `Matches(m)(value)` | evaluates to `true` if `value` matches `m`. You can use `Matches(m)` alone as a unary functor. | +| `ExplainMatchResult(m, value, result_listener)` | evaluates to `true` if `value` matches `m`, explaining the result to `result_listener`. | +| `Value(value, m)` | evaluates to `true` if `value` matches `m`. | + + +#### Defining Matchers + + +| Matcher | Description | +| :----------------------------------- | :------------------------------------ | +| `MATCHER(IsEven, "") { return (arg % 2) == 0; }` | Defines a matcher `IsEven()` to match an even number. | +| `MATCHER_P(IsDivisibleBy, n, "") { *result_listener << "where the remainder is " << (arg % n); return (arg % n) == 0; }` | Defines a matcher `IsDivisibleBy(n)` to match a number divisible by `n`. | +| `MATCHER_P2(IsBetween, a, b, std::string(negation ? "isn't" : "is") + " between " + PrintToString(a) + " and " + PrintToString(b)) { return a <= arg && arg <= b; }` | Defines a matcher `IsBetween(a, b)` to match a value in the range [`a`, `b`]. | + + +**Notes:** + +1. The `MATCHER*` macros cannot be used inside a function or class. +2. The matcher body must be *purely functional* (i.e. it cannot have any side + effect, and the result must not depend on anything other than the value + being matched and the matcher parameters). +3. You can use `PrintToString(x)` to convert a value `x` of any type to a + string. + +### Actions {#ActionList} + +**Actions** specify what a mock function should do when invoked. + +#### Returning a Value + + +| | | +| :-------------------------------- | :-------------------------------------------- | +| `Return()` | Return from a `void` mock function. | +| `Return(value)` | Return `value`. 
If the type of `value` is different to the mock function's return type, `value` is converted to the latter type at the time the expectation is set, not when the action is executed. | +| `ReturnArg()` | Return the `N`-th (0-based) argument. | +| `ReturnNew(a1, ..., ak)` | Return `new T(a1, ..., ak)`; a different object is created each time. | +| `ReturnNull()` | Return a null pointer. | +| `ReturnPointee(ptr)` | Return the value pointed to by `ptr`. | +| `ReturnRef(variable)` | Return a reference to `variable`. | +| `ReturnRefOfCopy(value)` | Return a reference to a copy of `value`; the copy lives as long as the action. | +| `ReturnRoundRobin({a1, ..., ak})` | Each call will return the next `ai` in the list, starting at the beginning when the end of the list is reached. | + + +#### Side Effects + + +| | | +| :--------------------------------- | :-------------------------------------- | +| `Assign(&variable, value)` | Assign `value` to variable. | +| `DeleteArg()` | Delete the `N`-th (0-based) argument, which must be a pointer. | +| `SaveArg(pointer)` | Save the `N`-th (0-based) argument to `*pointer`. | +| `SaveArgPointee(pointer)` | Save the value pointed to by the `N`-th (0-based) argument to `*pointer`. | +| `SetArgReferee(value)` | Assign value to the variable referenced by the `N`-th (0-based) argument. | +| `SetArgPointee(value)` | Assign `value` to the variable pointed by the `N`-th (0-based) argument. | +| `SetArgumentPointee(value)` | Same as `SetArgPointee(value)`. Deprecated. Will be removed in v1.7.0. | +| `SetArrayArgument(first, last)` | Copies the elements in source range [`first`, `last`) to the array pointed to by the `N`-th (0-based) argument, which can be either a pointer or an iterator. The action does not take ownership of the elements in the source range. | +| `SetErrnoAndReturn(error, value)` | Set `errno` to `error` and return `value`. | +| `Throw(exception)` | Throws the given exception, which can be any copyable value. Available since v1.1.0. | + + +#### Using a Function, Functor, or Lambda as an Action + +In the following, by "callable" we mean a free function, `std::function`, +functor, or lambda. + + +| | | +| :---------------------------------- | :------------------------------------- | +| `f` | Invoke f with the arguments passed to the mock function, where f is a callable. | +| `Invoke(f)` | Invoke `f` with the arguments passed to the mock function, where `f` can be a global/static function or a functor. | +| `Invoke(object_pointer, &class::method)` | Invoke the method on the object with the arguments passed to the mock function. | +| `InvokeWithoutArgs(f)` | Invoke `f`, which can be a global/static function or a functor. `f` must take no arguments. | +| `InvokeWithoutArgs(object_pointer, &class::method)` | Invoke the method on the object, which takes no arguments. | +| `InvokeArgument(arg1, arg2, ..., argk)` | Invoke the mock function's `N`-th (0-based) argument, which must be a function or a functor, with the `k` arguments. | + + +The return value of the invoked function is used as the return value of the +action. + +When defining a callable to be used with `Invoke*()`, you can declare any unused +parameters as `Unused`: + +```cpp +using ::testing::Invoke; +double Distance(Unused, double x, double y) { return sqrt(x*x + y*y); } +... +EXPECT_CALL(mock, Foo("Hi", _, _)).WillOnce(Invoke(Distance)); +``` + +`Invoke(callback)` and `InvokeWithoutArgs(callback)` take ownership of +`callback`, which must be permanent. 
The type of `callback` must be a base +callback type instead of a derived one, e.g. + +```cpp + BlockingClosure* done = new BlockingClosure; + ... Invoke(done) ...; // This won't compile! + + Closure* done2 = new BlockingClosure; + ... Invoke(done2) ...; // This works. +``` + +In `InvokeArgument(...)`, if an argument needs to be passed by reference, +wrap it inside `ByRef()`. For example, + +```cpp +using ::testing::ByRef; +using ::testing::InvokeArgument; +... +InvokeArgument<2>(5, string("Hi"), ByRef(foo)) +``` + +calls the mock function's #2 argument, passing to it `5` and `string("Hi")` by +value, and `foo` by reference. + +#### Default Action + + +| Matcher | Description | +| :------------ | :----------------------------------------------------- | +| `DoDefault()` | Do the default action (specified by `ON_CALL()` or the built-in one). | + + +**Note:** due to technical reasons, `DoDefault()` cannot be used inside a +composite action - trying to do so will result in a run-time error. + + + +#### Composite Actions + + +| | | +| :----------------------------- | :------------------------------------------ | +| `DoAll(a1, a2, ..., an)` | Do all actions `a1` to `an` and return the result of `an` in each invocation. The first `n - 1` sub-actions must return void. | +| `IgnoreResult(a)` | Perform action `a` and ignore its result. `a` must not return void. | +| `WithArg(a)` | Pass the `N`-th (0-based) argument of the mock function to action `a` and perform it. | +| `WithArgs(a)` | Pass the selected (0-based) arguments of the mock function to action `a` and perform it. | +| `WithoutArgs(a)` | Perform action `a` without any arguments. | + + +#### Defining Actions + + +| | | +| :--------------------------------- | :-------------------------------------- | +| `ACTION(Sum) { return arg0 + arg1; }` | Defines an action `Sum()` to return the sum of the mock function's argument #0 and #1. | +| `ACTION_P(Plus, n) { return arg0 + n; }` | Defines an action `Plus(n)` to return the sum of the mock function's argument #0 and `n`. | +| `ACTION_Pk(Foo, p1, ..., pk) { statements; }` | Defines a parameterized action `Foo(p1, ..., pk)` to execute the given `statements`. | + + +The `ACTION*` macros cannot be used inside a function or class. + +### Cardinalities {#CardinalityList} + +These are used in `Times()` to specify how many times a mock function will be +called: + + +| | | +| :---------------- | :----------------------------------------------------- | +| `AnyNumber()` | The function can be called any number of times. | +| `AtLeast(n)` | The call is expected at least `n` times. | +| `AtMost(n)` | The call is expected at most `n` times. | +| `Between(m, n)` | The call is expected between `m` and `n` (inclusive) times. | +| `Exactly(n) or n` | The call is expected exactly `n` times. In particular, the call should never happen when `n` is 0. | + + +### Expectation Order + +By default, the expectations can be matched in *any* order. If some or all +expectations must be matched in a given order, there are two ways to specify it. +They can be used either independently or together. + +#### The After Clause {#AfterClause} + +```cpp +using ::testing::Expectation; +... +Expectation init_x = EXPECT_CALL(foo, InitX()); +Expectation init_y = EXPECT_CALL(foo, InitY()); +EXPECT_CALL(foo, Bar()) + .After(init_x, init_y); +``` + +says that `Bar()` can be called only after both `InitX()` and `InitY()` have +been called. 
+ +If you don't know how many pre-requisites an expectation has when you write it, +you can use an `ExpectationSet` to collect them: + +```cpp +using ::testing::ExpectationSet; +... +ExpectationSet all_inits; +for (int i = 0; i < element_count; i++) { + all_inits += EXPECT_CALL(foo, InitElement(i)); +} +EXPECT_CALL(foo, Bar()) + .After(all_inits); +``` + +says that `Bar()` can be called only after all elements have been initialized +(but we don't care about which elements get initialized before the others). + +Modifying an `ExpectationSet` after using it in an `.After()` doesn't affect the +meaning of the `.After()`. + +#### Sequences {#UsingSequences} + +When you have a long chain of sequential expectations, it's easier to specify +the order using **sequences**, which don't require you to given each expectation +in the chain a different name. *All expected calls* in the same sequence must +occur in the order they are specified. + +```cpp +using ::testing::Return; +using ::testing::Sequence; +Sequence s1, s2; +... +EXPECT_CALL(foo, Reset()) + .InSequence(s1, s2) + .WillOnce(Return(true)); +EXPECT_CALL(foo, GetSize()) + .InSequence(s1) + .WillOnce(Return(1)); +EXPECT_CALL(foo, Describe(A())) + .InSequence(s2) + .WillOnce(Return("dummy")); +``` + +says that `Reset()` must be called before *both* `GetSize()` *and* `Describe()`, +and the latter two can occur in any order. + +To put many expectations in a sequence conveniently: + +```cpp +using ::testing::InSequence; +{ + InSequence seq; + + EXPECT_CALL(...)...; + EXPECT_CALL(...)...; + ... + EXPECT_CALL(...)...; +} +``` + +says that all expected calls in the scope of `seq` must occur in strict order. +The name `seq` is irrelevant. + +### Verifying and Resetting a Mock + +gMock will verify the expectations on a mock object when it is destructed, or +you can do it earlier: + +```cpp +using ::testing::Mock; +... +// Verifies and removes the expectations on mock_obj; +// returns true if and only if successful. +Mock::VerifyAndClearExpectations(&mock_obj); +... +// Verifies and removes the expectations on mock_obj; +// also removes the default actions set by ON_CALL(); +// returns true if and only if successful. +Mock::VerifyAndClear(&mock_obj); +``` + +You can also tell gMock that a mock object can be leaked and doesn't need to be +verified: + +```cpp +Mock::AllowLeak(&mock_obj); +``` + +### Mock Classes + +gMock defines a convenient mock class template + +```cpp +class MockFunction { + public: + MOCK_METHOD(R, Call, (A1, ..., An)); +}; +``` + +See this [recipe](cook_book.md#using-check-points) for one application of it. + +### Flags + + +| Flag | Description | +| :----------------------------- | :---------------------------------------- | +| `--gmock_catch_leaked_mocks=0` | Don't report leaked mock objects as failures. | +| `--gmock_verbose=LEVEL` | Sets the default verbosity level (`info`, `warning`, or `error`) of Google Mock messages. | + diff --git a/GraphBLAS/CUDA/test/googlemock/docs/cook_book.md b/GraphBLAS/CUDA/test/googlemock/docs/cook_book.md new file mode 100644 index 0000000000..51eb94a9ad --- /dev/null +++ b/GraphBLAS/CUDA/test/googlemock/docs/cook_book.md @@ -0,0 +1,4271 @@ +# gMock Cookbook + + + +You can find recipes for using gMock here. If you haven't yet, please read +[this](for_dummies.md) first to make sure you understand the basics. + +**Note:** gMock lives in the `testing` name space. For readability, it is +recommended to write `using ::testing::Foo;` once in your file before using the +name `Foo` defined by gMock. 
We omit such `using` statements in this section for +brevity, but you should do it in your own code. + +## Creating Mock Classes + +Mock classes are defined as normal classes, using the `MOCK_METHOD` macro to +generate mocked methods. The macro gets 3 or 4 parameters: + +```cpp +class MyMock { + public: + MOCK_METHOD(ReturnType, MethodName, (Args...)); + MOCK_METHOD(ReturnType, MethodName, (Args...), (Specs...)); +}; +``` + +The first 3 parameters are simply the method declaration, split into 3 parts. +The 4th parameter accepts a closed list of qualifiers, which affect the +generated method: + +* **`const`** - Makes the mocked method a `const` method. Required if + overriding a `const` method. +* **`override`** - Marks the method with `override`. Recommended if overriding + a `virtual` method. +* **`noexcept`** - Marks the method with `noexcept`. Required if overriding a + `noexcept` method. +* **`Calltype(...)`** - Sets the call type for the method (e.g. to + `STDMETHODCALLTYPE`), useful in Windows. + +### Dealing with unprotected commas + +Unprotected commas, i.e. commas which are not surrounded by parentheses, prevent +`MOCK_METHOD` from parsing its arguments correctly: + +```cpp {.bad} +class MockFoo { + public: + MOCK_METHOD(std::pair, GetPair, ()); // Won't compile! + MOCK_METHOD(bool, CheckMap, (std::map, bool)); // Won't compile! +}; +``` + +Solution 1 - wrap with parentheses: + +```cpp {.good} +class MockFoo { + public: + MOCK_METHOD((std::pair), GetPair, ()); + MOCK_METHOD(bool, CheckMap, ((std::map), bool)); +}; +``` + +Note that wrapping a return or argument type with parentheses is, in general, +invalid C++. `MOCK_METHOD` removes the parentheses. + +Solution 2 - define an alias: + +```cpp {.good} +class MockFoo { + public: + using BoolAndInt = std::pair; + MOCK_METHOD(BoolAndInt, GetPair, ()); + using MapIntDouble = std::map; + MOCK_METHOD(bool, CheckMap, (MapIntDouble, bool)); +}; +``` + +### Mocking Private or Protected Methods + +You must always put a mock method definition (`MOCK_METHOD`) in a `public:` +section of the mock class, regardless of the method being mocked being `public`, +`protected`, or `private` in the base class. This allows `ON_CALL` and +`EXPECT_CALL` to reference the mock function from outside of the mock class. +(Yes, C++ allows a subclass to change the access level of a virtual function in +the base class.) Example: + +```cpp +class Foo { + public: + ... + virtual bool Transform(Gadget* g) = 0; + + protected: + virtual void Resume(); + + private: + virtual int GetTimeOut(); +}; + +class MockFoo : public Foo { + public: + ... + MOCK_METHOD(bool, Transform, (Gadget* g), (override)); + + // The following must be in the public section, even though the + // methods are protected or private in the base class. + MOCK_METHOD(void, Resume, (), (override)); + MOCK_METHOD(int, GetTimeOut, (), (override)); +}; +``` + +### Mocking Overloaded Methods + +You can mock overloaded functions as usual. No special attention is required: + +```cpp +class Foo { + ... + + // Must be virtual as we'll inherit from Foo. + virtual ~Foo(); + + // Overloaded on the types and/or numbers of arguments. + virtual int Add(Element x); + virtual int Add(int times, Element x); + + // Overloaded on the const-ness of this object. + virtual Bar& GetBar(); + virtual const Bar& GetBar() const; +}; + +class MockFoo : public Foo { + ... 
+ MOCK_METHOD(int, Add, (Element x), (override)); + MOCK_METHOD(int, Add, (int times, Element x), (override)); + + MOCK_METHOD(Bar&, GetBar, (), (override)); + MOCK_METHOD(const Bar&, GetBar, (), (const, override)); +}; +``` + +**Note:** if you don't mock all versions of the overloaded method, the compiler +will give you a warning about some methods in the base class being hidden. To +fix that, use `using` to bring them in scope: + +```cpp +class MockFoo : public Foo { + ... + using Foo::Add; + MOCK_METHOD(int, Add, (Element x), (override)); + // We don't want to mock int Add(int times, Element x); + ... +}; +``` + +### Mocking Class Templates + +You can mock class templates just like any class. + +```cpp +template +class StackInterface { + ... + // Must be virtual as we'll inherit from StackInterface. + virtual ~StackInterface(); + + virtual int GetSize() const = 0; + virtual void Push(const Elem& x) = 0; +}; + +template +class MockStack : public StackInterface { + ... + MOCK_METHOD(int, GetSize, (), (override)); + MOCK_METHOD(void, Push, (const Elem& x), (override)); +}; +``` + +### Mocking Non-virtual Methods {#MockingNonVirtualMethods} + +gMock can mock non-virtual functions to be used in Hi-perf dependency +injection. + +In this case, instead of sharing a common base class with the real class, your +mock class will be *unrelated* to the real class, but contain methods with the +same signatures. The syntax for mocking non-virtual methods is the *same* as +mocking virtual methods (just don't add `override`): + +```cpp +// A simple packet stream class. None of its members is virtual. +class ConcretePacketStream { + public: + void AppendPacket(Packet* new_packet); + const Packet* GetPacket(size_t packet_number) const; + size_t NumberOfPackets() const; + ... +}; + +// A mock packet stream class. It inherits from no other, but defines +// GetPacket() and NumberOfPackets(). +class MockPacketStream { + public: + MOCK_METHOD(const Packet*, GetPacket, (size_t packet_number), (const)); + MOCK_METHOD(size_t, NumberOfPackets, (), (const)); + ... +}; +``` + +Note that the mock class doesn't define `AppendPacket()`, unlike the real class. +That's fine as long as the test doesn't need to call it. + +Next, you need a way to say that you want to use `ConcretePacketStream` in +production code, and use `MockPacketStream` in tests. Since the functions are +not virtual and the two classes are unrelated, you must specify your choice at +*compile time* (as opposed to run time). + +One way to do it is to templatize your code that needs to use a packet stream. +More specifically, you will give your code a template type argument for the type +of the packet stream. In production, you will instantiate your template with +`ConcretePacketStream` as the type argument. In tests, you will instantiate the +same template with `MockPacketStream`. For example, you may write: + +```cpp +template +void CreateConnection(PacketStream* stream) { ... } + +template +class PacketReader { + public: + void ReadPackets(PacketStream* stream, size_t packet_num); +}; +``` + +Then you can use `CreateConnection()` and +`PacketReader` in production code, and use +`CreateConnection()` and `PacketReader` in +tests. + +```cpp + MockPacketStream mock_stream; + EXPECT_CALL(mock_stream, ...)...; + .. set more expectations on mock_stream ... + PacketReader reader(&mock_stream); + ... exercise reader ... +``` + +### Mocking Free Functions + +It's possible to use gMock to mock a free function (i.e. a C-style function or a +static method). 
You just need to rewrite your code to use an interface (abstract +class). + +Instead of calling a free function (say, `OpenFile`) directly, introduce an +interface for it and have a concrete subclass that calls the free function: + +```cpp +class FileInterface { + public: + ... + virtual bool Open(const char* path, const char* mode) = 0; +}; + +class File : public FileInterface { + public: + ... + virtual bool Open(const char* path, const char* mode) { + return OpenFile(path, mode); + } +}; +``` + +Your code should talk to `FileInterface` to open a file. Now it's easy to mock +out the function. + +This may seem like a lot of hassle, but in practice you often have multiple +related functions that you can put in the same interface, so the per-function +syntactic overhead will be much lower. + +If you are concerned about the performance overhead incurred by virtual +functions, and profiling confirms your concern, you can combine this with the +recipe for [mocking non-virtual methods](#MockingNonVirtualMethods). + +### Old-Style `MOCK_METHODn` Macros + +Before the generic `MOCK_METHOD` macro was introduced, mocks where created using +a family of macros collectively called `MOCK_METHODn`. These macros are still +supported, though migration to the new `MOCK_METHOD` is recommended. + +The macros in the `MOCK_METHODn` family differ from `MOCK_METHOD`: + +* The general structure is `MOCK_METHODn(MethodName, ReturnType(Args))`, + instead of `MOCK_METHOD(ReturnType, MethodName, (Args))`. +* The number `n` must equal the number of arguments. +* When mocking a const method, one must use `MOCK_CONST_METHODn`. +* When mocking a class template, the macro name must be suffixed with `_T`. +* In order to specify the call type, the macro name must be suffixed with + `_WITH_CALLTYPE`, and the call type is the first macro argument. + +Old macros and their new equivalents: + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| Method type | Old | New |
+| :----------------------------------------------- | :---------------------------------------------------------------------- | :----------------------------------------------------------------------- |
+| Simple | `MOCK_METHOD1(Foo, bool(int))` | `MOCK_METHOD(bool, Foo, (int))` |
+| Const Method | `MOCK_CONST_METHOD1(Foo, bool(int))` | `MOCK_METHOD(bool, Foo, (int), (const))` |
+| Method in a Class Template | `MOCK_METHOD1_T(Foo, bool(int))` | `MOCK_METHOD(bool, Foo, (int))` |
+| Const Method in a Class Template | `MOCK_CONST_METHOD1_T(Foo, bool(int))` | `MOCK_METHOD(bool, Foo, (int), (const))` |
+| Method with Call Type | `MOCK_METHOD1_WITH_CALLTYPE(STDMETHODCALLTYPE, Foo, bool(int))` | `MOCK_METHOD(bool, Foo, (int), (Calltype(STDMETHODCALLTYPE)))` |
+| Const Method with Call Type | `MOCK_CONST_METHOD1_WITH_CALLTYPE(STDMETHODCALLTYPE, Foo, bool(int))` | `MOCK_METHOD(bool, Foo, (int), (const, Calltype(STDMETHODCALLTYPE)))` |
+| Method with Call Type in a Class Template | `MOCK_METHOD1_T_WITH_CALLTYPE(STDMETHODCALLTYPE, Foo, bool(int))` | `MOCK_METHOD(bool, Foo, (int), (Calltype(STDMETHODCALLTYPE)))` |
+| Const Method with Call Type in a Class Template | `MOCK_CONST_METHOD1_T_WITH_CALLTYPE(STDMETHODCALLTYPE, Foo, bool(int))` | `MOCK_METHOD(bool, Foo, (int), (const, Calltype(STDMETHODCALLTYPE)))` |
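+
+As an illustration of the mapping above, here is the same interface mocked
+once with the old-style macros and once with the generic `MOCK_METHOD` macro.
+This is a sketch only; `Adder`, `OldMockAdder`, and `NewMockAdder` are made-up
+names, not part of gMock:
+
+```cpp
+#include "gmock/gmock.h"
+
+// A hypothetical interface, used only for this example.
+class Adder {
+ public:
+  virtual ~Adder() = default;
+  virtual bool Add(int x) = 0;
+  virtual int GetSize() const = 0;
+};
+
+// Old style: the macro name encodes the argument count and const-ness.
+class OldMockAdder : public Adder {
+ public:
+  MOCK_METHOD1(Add, bool(int));        // mocks: bool Add(int x)
+  MOCK_CONST_METHOD0(GetSize, int());  // mocks: int GetSize() const
+};
+
+// New style: a single MOCK_METHOD; qualifiers go in the 4th parameter.
+class NewMockAdder : public Adder {
+ public:
+  MOCK_METHOD(bool, Add, (int x), (override));
+  MOCK_METHOD(int, GetSize, (), (const, override));
+};
+```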
+ +### The Nice, the Strict, and the Naggy {#NiceStrictNaggy} + +If a mock method has no `EXPECT_CALL` spec but is called, we say that it's an +"uninteresting call", and the default action (which can be specified using +`ON_CALL()`) of the method will be taken. Currently, an uninteresting call will +also by default cause gMock to print a warning. (In the future, we might remove +this warning by default.) + +However, sometimes you may want to ignore these uninteresting calls, and +sometimes you may want to treat them as errors. gMock lets you make the decision +on a per-mock-object basis. + +Suppose your test uses a mock class `MockFoo`: + +```cpp +TEST(...) { + MockFoo mock_foo; + EXPECT_CALL(mock_foo, DoThis()); + ... code that uses mock_foo ... +} +``` + +If a method of `mock_foo` other than `DoThis()` is called, you will get a +warning. However, if you rewrite your test to use `NiceMock` instead, +you can suppress the warning: + +```cpp +using ::testing::NiceMock; + +TEST(...) { + NiceMock mock_foo; + EXPECT_CALL(mock_foo, DoThis()); + ... code that uses mock_foo ... +} +``` + +`NiceMock` is a subclass of `MockFoo`, so it can be used wherever +`MockFoo` is accepted. + +It also works if `MockFoo`'s constructor takes some arguments, as +`NiceMock` "inherits" `MockFoo`'s constructors: + +```cpp +using ::testing::NiceMock; + +TEST(...) { + NiceMock mock_foo(5, "hi"); // Calls MockFoo(5, "hi"). + EXPECT_CALL(mock_foo, DoThis()); + ... code that uses mock_foo ... +} +``` + +The usage of `StrictMock` is similar, except that it makes all uninteresting +calls failures: + +```cpp +using ::testing::StrictMock; + +TEST(...) { + StrictMock mock_foo; + EXPECT_CALL(mock_foo, DoThis()); + ... code that uses mock_foo ... + + // The test will fail if a method of mock_foo other than DoThis() + // is called. +} +``` + +NOTE: `NiceMock` and `StrictMock` only affects *uninteresting* calls (calls of +*methods* with no expectations); they do not affect *unexpected* calls (calls of +methods with expectations, but they don't match). See +[Understanding Uninteresting vs Unexpected Calls](#uninteresting-vs-unexpected). + +There are some caveats though (I dislike them just as much as the next guy, but +sadly they are side effects of C++'s limitations): + +1. `NiceMock` and `StrictMock` only work for mock methods + defined using the `MOCK_METHOD` macro **directly** in the `MockFoo` class. + If a mock method is defined in a **base class** of `MockFoo`, the "nice" or + "strict" modifier may not affect it, depending on the compiler. In + particular, nesting `NiceMock` and `StrictMock` (e.g. + `NiceMock >`) is **not** supported. +2. `NiceMock` and `StrictMock` may not work correctly if the + destructor of `MockFoo` is not virtual. We would like to fix this, but it + requires cleaning up existing tests. +3. During the constructor or destructor of `MockFoo`, the mock object is *not* + nice or strict. This may cause surprises if the constructor or destructor + calls a mock method on `this` object. (This behavior, however, is consistent + with C++'s general rule: if a constructor or destructor calls a virtual + method of `this` object, that method is treated as non-virtual. In other + words, to the base class's constructor or destructor, `this` object behaves + like an instance of the base class, not the derived class. This rule is + required for safety. 
Otherwise a base constructor may use members of a + derived class before they are initialized, or a base destructor may use + members of a derived class after they have been destroyed.) + +Finally, you should be **very cautious** about when to use naggy or strict +mocks, as they tend to make tests more brittle and harder to maintain. When you +refactor your code without changing its externally visible behavior, ideally you +shouldn't need to update any tests. If your code interacts with a naggy mock, +however, you may start to get spammed with warnings as the result of your +change. Worse, if your code interacts with a strict mock, your tests may start +to fail and you'll be forced to fix them. Our general recommendation is to use +nice mocks (not yet the default) most of the time, use naggy mocks (the current +default) when developing or debugging tests, and use strict mocks only as the +last resort. + +### Simplifying the Interface without Breaking Existing Code {#SimplerInterfaces} + +Sometimes a method has a long list of arguments that is mostly uninteresting. +For example: + +```cpp +class LogSink { + public: + ... + virtual void send(LogSeverity severity, const char* full_filename, + const char* base_filename, int line, + const struct tm* tm_time, + const char* message, size_t message_len) = 0; +}; +``` + +This method's argument list is lengthy and hard to work with (the `message` +argument is not even 0-terminated). If we mock it as is, using the mock will be +awkward. If, however, we try to simplify this interface, we'll need to fix all +clients depending on it, which is often infeasible. + +The trick is to redispatch the method in the mock class: + +```cpp +class ScopedMockLog : public LogSink { + public: + ... + virtual void send(LogSeverity severity, const char* full_filename, + const char* base_filename, int line, const tm* tm_time, + const char* message, size_t message_len) { + // We are only interested in the log severity, full file name, and + // log message. + Log(severity, full_filename, std::string(message, message_len)); + } + + // Implements the mock method: + // + // void Log(LogSeverity severity, + // const string& file_path, + // const string& message); + MOCK_METHOD(void, Log, + (LogSeverity severity, const string& file_path, + const string& message)); +}; +``` + +By defining a new mock method with a trimmed argument list, we make the mock +class more user-friendly. + +This technique may also be applied to make overloaded methods more amenable to +mocking. For example, when overloads have been used to implement default +arguments: + +```cpp +class MockTurtleFactory : public TurtleFactory { + public: + Turtle* MakeTurtle(int length, int weight) override { ... } + Turtle* MakeTurtle(int length, int weight, int speed) override { ... } + + // the above methods delegate to this one: + MOCK_METHOD(Turtle*, DoMakeTurtle, ()); +}; +``` + +This allows tests that don't care which overload was invoked to avoid specifying +argument matchers: + +```cpp +ON_CALL(factory, DoMakeTurtle) + .WillByDefault(MakeMockTurtle()); +``` + +### Alternative to Mocking Concrete Classes + +Often you may find yourself using classes that don't implement interfaces. In +order to test your code that uses such a class (let's call it `Concrete`), you +may be tempted to make the methods of `Concrete` virtual and then mock it. + +Try not to do that. + +Making a non-virtual function virtual is a big decision. It creates an extension +point where subclasses can tweak your class' behavior. 
This weakens your control +on the class because now it's harder to maintain the class invariants. You +should make a function virtual only when there is a valid reason for a subclass +to override it. + +Mocking concrete classes directly is problematic as it creates a tight coupling +between the class and the tests - any small change in the class may invalidate +your tests and make test maintenance a pain. + +To avoid such problems, many programmers have been practicing "coding to +interfaces": instead of talking to the `Concrete` class, your code would define +an interface and talk to it. Then you implement that interface as an adaptor on +top of `Concrete`. In tests, you can easily mock that interface to observe how +your code is doing. + +This technique incurs some overhead: + +* You pay the cost of virtual function calls (usually not a problem). +* There is more abstraction for the programmers to learn. + +However, it can also bring significant benefits in addition to better +testability: + +* `Concrete`'s API may not fit your problem domain very well, as you may not + be the only client it tries to serve. By designing your own interface, you + have a chance to tailor it to your need - you may add higher-level + functionalities, rename stuff, etc instead of just trimming the class. This + allows you to write your code (user of the interface) in a more natural way, + which means it will be more readable, more maintainable, and you'll be more + productive. +* If `Concrete`'s implementation ever has to change, you don't have to rewrite + everywhere it is used. Instead, you can absorb the change in your + implementation of the interface, and your other code and tests will be + insulated from this change. + +Some people worry that if everyone is practicing this technique, they will end +up writing lots of redundant code. This concern is totally understandable. +However, there are two reasons why it may not be the case: + +* Different projects may need to use `Concrete` in different ways, so the best + interfaces for them will be different. Therefore, each of them will have its + own domain-specific interface on top of `Concrete`, and they will not be the + same code. +* If enough projects want to use the same interface, they can always share it, + just like they have been sharing `Concrete`. You can check in the interface + and the adaptor somewhere near `Concrete` (perhaps in a `contrib` + sub-directory) and let many projects use it. + +You need to weigh the pros and cons carefully for your particular problem, but +I'd like to assure you that the Java community has been practicing this for a +long time and it's a proven effective technique applicable in a wide variety of +situations. :-) + +### Delegating Calls to a Fake {#DelegatingToFake} + +Some times you have a non-trivial fake implementation of an interface. For +example: + +```cpp +class Foo { + public: + virtual ~Foo() {} + virtual char DoThis(int n) = 0; + virtual void DoThat(const char* s, int* p) = 0; +}; + +class FakeFoo : public Foo { + public: + char DoThis(int n) override { + return (n > 0) ? '+' : + (n < 0) ? '-' : '0'; + } + + void DoThat(const char* s, int* p) override { + *p = strlen(s); + } +}; +``` + +Now you want to mock this interface such that you can set expectations on it. +However, you also want to use `FakeFoo` for the default behavior, as duplicating +it in the mock object is, well, a lot of work. 
+ +When you define the mock class using gMock, you can have it delegate its default +action to a fake class you already have, using this pattern: + +```cpp +class MockFoo : public Foo { + public: + // Normal mock method definitions using gMock. + MOCK_METHOD(char, DoThis, (int n), (override)); + MOCK_METHOD(void, DoThat, (const char* s, int* p), (override)); + + // Delegates the default actions of the methods to a FakeFoo object. + // This must be called *before* the custom ON_CALL() statements. + void DelegateToFake() { + ON_CALL(*this, DoThis).WillByDefault([this](int n) { + return fake_.DoThis(n); + }); + ON_CALL(*this, DoThat).WillByDefault([this](const char* s, int* p) { + fake_.DoThat(s, p); + }); + } + + private: + FakeFoo fake_; // Keeps an instance of the fake in the mock. +}; +``` + +With that, you can use `MockFoo` in your tests as usual. Just remember that if +you don't explicitly set an action in an `ON_CALL()` or `EXPECT_CALL()`, the +fake will be called upon to do it.: + +```cpp +using ::testing::_; + +TEST(AbcTest, Xyz) { + MockFoo foo; + + foo.DelegateToFake(); // Enables the fake for delegation. + + // Put your ON_CALL(foo, ...)s here, if any. + + // No action specified, meaning to use the default action. + EXPECT_CALL(foo, DoThis(5)); + EXPECT_CALL(foo, DoThat(_, _)); + + int n = 0; + EXPECT_EQ('+', foo.DoThis(5)); // FakeFoo::DoThis() is invoked. + foo.DoThat("Hi", &n); // FakeFoo::DoThat() is invoked. + EXPECT_EQ(2, n); +} +``` + +**Some tips:** + +* If you want, you can still override the default action by providing your own + `ON_CALL()` or using `.WillOnce()` / `.WillRepeatedly()` in `EXPECT_CALL()`. +* In `DelegateToFake()`, you only need to delegate the methods whose fake + implementation you intend to use. + +* The general technique discussed here works for overloaded methods, but + you'll need to tell the compiler which version you mean. To disambiguate a + mock function (the one you specify inside the parentheses of `ON_CALL()`), + use [this technique](#SelectOverload); to disambiguate a fake function (the + one you place inside `Invoke()`), use a `static_cast` to specify the + function's type. For instance, if class `Foo` has methods `char DoThis(int + n)` and `bool DoThis(double x) const`, and you want to invoke the latter, + you need to write `Invoke(&fake_, static_cast(&FakeFoo::DoThis))` instead of `Invoke(&fake_, &FakeFoo::DoThis)` + (The strange-looking thing inside the angled brackets of `static_cast` is + the type of a function pointer to the second `DoThis()` method.). + +* Having to mix a mock and a fake is often a sign of something gone wrong. + Perhaps you haven't got used to the interaction-based way of testing yet. Or + perhaps your interface is taking on too many roles and should be split up. + Therefore, **don't abuse this**. We would only recommend to do it as an + intermediate step when you are refactoring your code. + +Regarding the tip on mixing a mock and a fake, here's an example on why it may +be a bad sign: Suppose you have a class `System` for low-level system +operations. In particular, it does file and I/O operations. And suppose you want +to test how your code uses `System` to do I/O, and you just want the file +operations to work normally. If you mock out the entire `System` class, you'll +have to provide a fake implementation for the file operation part, which +suggests that `System` is taking on too many roles. 
+ +Instead, you can define a `FileOps` interface and an `IOOps` interface and split +`System`'s functionalities into the two. Then you can mock `IOOps` without +mocking `FileOps`. + +### Delegating Calls to a Real Object + +When using testing doubles (mocks, fakes, stubs, and etc), sometimes their +behaviors will differ from those of the real objects. This difference could be +either intentional (as in simulating an error such that you can test the error +handling code) or unintentional. If your mocks have different behaviors than the +real objects by mistake, you could end up with code that passes the tests but +fails in production. + +You can use the *delegating-to-real* technique to ensure that your mock has the +same behavior as the real object while retaining the ability to validate calls. +This technique is very similar to the [delegating-to-fake](#DelegatingToFake) +technique, the difference being that we use a real object instead of a fake. +Here's an example: + +```cpp +using ::testing::AtLeast; + +class MockFoo : public Foo { + public: + MockFoo() { + // By default, all calls are delegated to the real object. + ON_CALL(*this, DoThis).WillByDefault([this](int n) { + return real_.DoThis(n); + }); + ON_CALL(*this, DoThat).WillByDefault([this](const char* s, int* p) { + real_.DoThat(s, p); + }); + ... + } + MOCK_METHOD(char, DoThis, ...); + MOCK_METHOD(void, DoThat, ...); + ... + private: + Foo real_; +}; + +... + MockFoo mock; + EXPECT_CALL(mock, DoThis()) + .Times(3); + EXPECT_CALL(mock, DoThat("Hi")) + .Times(AtLeast(1)); + ... use mock in test ... +``` + +With this, gMock will verify that your code made the right calls (with the right +arguments, in the right order, called the right number of times, etc), and a +real object will answer the calls (so the behavior will be the same as in +production). This gives you the best of both worlds. + +### Delegating Calls to a Parent Class + +Ideally, you should code to interfaces, whose methods are all pure virtual. In +reality, sometimes you do need to mock a virtual method that is not pure (i.e, +it already has an implementation). For example: + +```cpp +class Foo { + public: + virtual ~Foo(); + + virtual void Pure(int n) = 0; + virtual int Concrete(const char* str) { ... } +}; + +class MockFoo : public Foo { + public: + // Mocking a pure method. + MOCK_METHOD(void, Pure, (int n), (override)); + // Mocking a concrete method. Foo::Concrete() is shadowed. + MOCK_METHOD(int, Concrete, (const char* str), (override)); +}; +``` + +Sometimes you may want to call `Foo::Concrete()` instead of +`MockFoo::Concrete()`. Perhaps you want to do it as part of a stub action, or +perhaps your test doesn't need to mock `Concrete()` at all (but it would be +oh-so painful to have to define a new mock class whenever you don't need to mock +one of its methods). + +The trick is to leave a back door in your mock class for accessing the real +methods in the base class: + +```cpp +class MockFoo : public Foo { + public: + // Mocking a pure method. + MOCK_METHOD(void, Pure, (int n), (override)); + // Mocking a concrete method. Foo::Concrete() is shadowed. + MOCK_METHOD(int, Concrete, (const char* str), (override)); + + // Use this to call Concrete() defined in Foo. + int FooConcrete(const char* str) { return Foo::Concrete(str); } +}; +``` + +Now, you can call `Foo::Concrete()` inside an action by: + +```cpp +... 
+ EXPECT_CALL(foo, Concrete).WillOnce([&foo](const char* str) { + return foo.FooConcrete(str); + }); +``` + +or tell the mock object that you don't want to mock `Concrete()`: + +```cpp +... + ON_CALL(foo, Concrete).WillByDefault([&foo](const char* str) { + return foo.FooConcrete(str); + }); +``` + +(Why don't we just write `{ return foo.Concrete(str); }`? If you do that, +`MockFoo::Concrete()` will be called (and cause an infinite recursion) since +`Foo::Concrete()` is virtual. That's just how C++ works.) + +## Using Matchers + +### Matching Argument Values Exactly + +You can specify exactly which arguments a mock method is expecting: + +```cpp +using ::testing::Return; +... + EXPECT_CALL(foo, DoThis(5)) + .WillOnce(Return('a')); + EXPECT_CALL(foo, DoThat("Hello", bar)); +``` + +### Using Simple Matchers + +You can use matchers to match arguments that have a certain property: + +```cpp +using ::testing::NotNull; +using ::testing::Return; +... + EXPECT_CALL(foo, DoThis(Ge(5))) // The argument must be >= 5. + .WillOnce(Return('a')); + EXPECT_CALL(foo, DoThat("Hello", NotNull())); + // The second argument must not be NULL. +``` + +A frequently used matcher is `_`, which matches anything: + +```cpp + EXPECT_CALL(foo, DoThat(_, NotNull())); +``` + + +### Combining Matchers {#CombiningMatchers} + +You can build complex matchers from existing ones using `AllOf()`, +`AllOfArray()`, `AnyOf()`, `AnyOfArray()` and `Not()`: + +```cpp +using ::testing::AllOf; +using ::testing::Gt; +using ::testing::HasSubstr; +using ::testing::Ne; +using ::testing::Not; +... + // The argument must be > 5 and != 10. + EXPECT_CALL(foo, DoThis(AllOf(Gt(5), + Ne(10)))); + + // The first argument must not contain sub-string "blah". + EXPECT_CALL(foo, DoThat(Not(HasSubstr("blah")), + NULL)); +``` + +### Casting Matchers {#SafeMatcherCast} + +gMock matchers are statically typed, meaning that the compiler can catch your +mistake if you use a matcher of the wrong type (for example, if you use `Eq(5)` +to match a `string` argument). Good for you! + +Sometimes, however, you know what you're doing and want the compiler to give you +some slack. One example is that you have a matcher for `long` and the argument +you want to match is `int`. While the two types aren't exactly the same, there +is nothing really wrong with using a `Matcher` to match an `int` - after +all, we can first convert the `int` argument to a `long` losslessly before +giving it to the matcher. + +To support this need, gMock gives you the `SafeMatcherCast(m)` function. It +casts a matcher `m` to type `Matcher`. To ensure safety, gMock checks that +(let `U` be the type `m` accepts : + +1. Type `T` can be *implicitly* cast to type `U`; +2. When both `T` and `U` are built-in arithmetic types (`bool`, integers, and + floating-point numbers), the conversion from `T` to `U` is not lossy (in + other words, any value representable by `T` can also be represented by `U`); + and +3. When `U` is a reference, `T` must also be a reference (as the underlying + matcher may be interested in the address of the `U` value). + +The code won't compile if any of these conditions isn't met. + +Here's one example: + +```cpp +using ::testing::SafeMatcherCast; + +// A base class and a child class. +class Base { ... }; +class Derived : public Base { ... }; + +class MockFoo : public Foo { + public: + MOCK_METHOD(void, DoThis, (Derived* derived), (override)); +}; + +... + MockFoo foo; + // m is a Matcher we got from somewhere. 
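  // m is assumed to be a Matcher<Base*>; SafeMatcherCast<Derived*>(m) turns it
  // into the Matcher<Derived*> that DoThis() expects.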
+ EXPECT_CALL(foo, DoThis(SafeMatcherCast(m))); +``` + +If you find `SafeMatcherCast(m)` too limiting, you can use a similar function +`MatcherCast(m)`. The difference is that `MatcherCast` works as long as you +can `static_cast` type `T` to type `U`. + +`MatcherCast` essentially lets you bypass C++'s type system (`static_cast` isn't +always safe as it could throw away information, for example), so be careful not +to misuse/abuse it. + +### Selecting Between Overloaded Functions {#SelectOverload} + +If you expect an overloaded function to be called, the compiler may need some +help on which overloaded version it is. + +To disambiguate functions overloaded on the const-ness of this object, use the +`Const()` argument wrapper. + +```cpp +using ::testing::ReturnRef; + +class MockFoo : public Foo { + ... + MOCK_METHOD(Bar&, GetBar, (), (override)); + MOCK_METHOD(const Bar&, GetBar, (), (const, override)); +}; + +... + MockFoo foo; + Bar bar1, bar2; + EXPECT_CALL(foo, GetBar()) // The non-const GetBar(). + .WillOnce(ReturnRef(bar1)); + EXPECT_CALL(Const(foo), GetBar()) // The const GetBar(). + .WillOnce(ReturnRef(bar2)); +``` + +(`Const()` is defined by gMock and returns a `const` reference to its argument.) + +To disambiguate overloaded functions with the same number of arguments but +different argument types, you may need to specify the exact type of a matcher, +either by wrapping your matcher in `Matcher()`, or using a matcher whose +type is fixed (`TypedEq`, `An()`, etc): + +```cpp +using ::testing::An; +using ::testing::Matcher; +using ::testing::TypedEq; + +class MockPrinter : public Printer { + public: + MOCK_METHOD(void, Print, (int n), (override)); + MOCK_METHOD(void, Print, (char c), (override)); +}; + +TEST(PrinterTest, Print) { + MockPrinter printer; + + EXPECT_CALL(printer, Print(An())); // void Print(int); + EXPECT_CALL(printer, Print(Matcher(Lt(5)))); // void Print(int); + EXPECT_CALL(printer, Print(TypedEq('a'))); // void Print(char); + + printer.Print(3); + printer.Print(6); + printer.Print('a'); +} +``` + +### Performing Different Actions Based on the Arguments + +When a mock method is called, the *last* matching expectation that's still +active will be selected (think "newer overrides older"). So, you can make a +method do different things depending on its argument values like this: + +```cpp +using ::testing::_; +using ::testing::Lt; +using ::testing::Return; +... + // The default case. + EXPECT_CALL(foo, DoThis(_)) + .WillRepeatedly(Return('b')); + // The more specific case. + EXPECT_CALL(foo, DoThis(Lt(5))) + .WillRepeatedly(Return('a')); +``` + +Now, if `foo.DoThis()` is called with a value less than 5, `'a'` will be +returned; otherwise `'b'` will be returned. + +### Matching Multiple Arguments as a Whole + +Sometimes it's not enough to match the arguments individually. For example, we +may want to say that the first argument must be less than the second argument. +The `With()` clause allows us to match all arguments of a mock function as a +whole. For example, + +```cpp +using ::testing::_; +using ::testing::Ne; +using ::testing::Lt; +... + EXPECT_CALL(foo, InRange(Ne(0), _)) + .With(Lt()); +``` + +says that the first argument of `InRange()` must not be 0, and must be less than +the second argument. + +The expression inside `With()` must be a matcher of type `Matcher>`, where `A1`, ..., `An` are the types of the function arguments. + +You can also write `AllArgs(m)` instead of `m` inside `.With()`. 
The two forms +are equivalent, but `.With(AllArgs(Lt()))` is more readable than `.With(Lt())`. + +You can use `Args(m)` to match the `n` selected arguments (as a +tuple) against `m`. For example, + +```cpp +using ::testing::_; +using ::testing::AllOf; +using ::testing::Args; +using ::testing::Lt; +... + EXPECT_CALL(foo, Blah) + .With(AllOf(Args<0, 1>(Lt()), Args<1, 2>(Lt()))); +``` + +says that `Blah` will be called with arguments `x`, `y`, and `z` where `x < y < +z`. Note that in this example, it wasn't necessary specify the positional +matchers. + +As a convenience and example, gMock provides some matchers for 2-tuples, +including the `Lt()` matcher above. See [here](#MultiArgMatchers) for the +complete list. + +Note that if you want to pass the arguments to a predicate of your own (e.g. +`.With(Args<0, 1>(Truly(&MyPredicate)))`), that predicate MUST be written to +take a `std::tuple` as its argument; gMock will pass the `n` selected arguments +as *one* single tuple to the predicate. + +### Using Matchers as Predicates + +Have you noticed that a matcher is just a fancy predicate that also knows how to +describe itself? Many existing algorithms take predicates as arguments (e.g. +those defined in STL's `` header), and it would be a shame if gMock +matchers were not allowed to participate. + +Luckily, you can use a matcher where a unary predicate functor is expected by +wrapping it inside the `Matches()` function. For example, + +```cpp +#include +#include + +using ::testing::Matches; +using ::testing::Ge; + +vector v; +... +// How many elements in v are >= 10? +const int count = count_if(v.begin(), v.end(), Matches(Ge(10))); +``` + +Since you can build complex matchers from simpler ones easily using gMock, this +gives you a way to conveniently construct composite predicates (doing the same +using STL's `` header is just painful). For example, here's a +predicate that's satisfied by any number that is >= 0, <= 100, and != 50: + +```cpp +using testing::AllOf; +using testing::Ge; +using testing::Le; +using testing::Matches; +using testing::Ne; +... +Matches(AllOf(Ge(0), Le(100), Ne(50))) +``` + +### Using Matchers in googletest Assertions + +Since matchers are basically predicates that also know how to describe +themselves, there is a way to take advantage of them in googletest assertions. +It's called `ASSERT_THAT` and `EXPECT_THAT`: + +```cpp + ASSERT_THAT(value, matcher); // Asserts that value matches matcher. + EXPECT_THAT(value, matcher); // The non-fatal version. +``` + +For example, in a googletest test you can write: + +```cpp +#include "gmock/gmock.h" + +using ::testing::AllOf; +using ::testing::Ge; +using ::testing::Le; +using ::testing::MatchesRegex; +using ::testing::StartsWith; + +... + EXPECT_THAT(Foo(), StartsWith("Hello")); + EXPECT_THAT(Bar(), MatchesRegex("Line \\d+")); + ASSERT_THAT(Baz(), AllOf(Ge(5), Le(10))); +``` + +which (as you can probably guess) executes `Foo()`, `Bar()`, and `Baz()`, and +verifies that: + +* `Foo()` returns a string that starts with `"Hello"`. +* `Bar()` returns a string that matches regular expression `"Line \\d+"`. +* `Baz()` returns a number in the range [5, 10]. + +The nice thing about these macros is that *they read like English*. They +generate informative messages too. For example, if the first `EXPECT_THAT()` +above fails, the message will be something like: + +```cpp +Value of: Foo() + Actual: "Hi, world!" 
+Expected: starts with "Hello" +``` + +**Credit:** The idea of `(ASSERT|EXPECT)_THAT` was borrowed from Joe Walnes' +Hamcrest project, which adds `assertThat()` to JUnit. + +### Using Predicates as Matchers + +gMock provides a [built-in set](#MatcherList) of matchers. In case you find them +lacking, you can use an arbitrary unary predicate function or functor as a +matcher - as long as the predicate accepts a value of the type you want. You do +this by wrapping the predicate inside the `Truly()` function, for example: + +```cpp +using ::testing::Truly; + +int IsEven(int n) { return (n % 2) == 0 ? 1 : 0; } +... + // Bar() must be called with an even number. + EXPECT_CALL(foo, Bar(Truly(IsEven))); +``` + +Note that the predicate function / functor doesn't have to return `bool`. It +works as long as the return value can be used as the condition in in statement +`if (condition) ...`. + + + +### Matching Arguments that Are Not Copyable + +When you do an `EXPECT_CALL(mock_obj, Foo(bar))`, gMock saves away a copy of +`bar`. When `Foo()` is called later, gMock compares the argument to `Foo()` with +the saved copy of `bar`. This way, you don't need to worry about `bar` being +modified or destroyed after the `EXPECT_CALL()` is executed. The same is true +when you use matchers like `Eq(bar)`, `Le(bar)`, and so on. + +But what if `bar` cannot be copied (i.e. has no copy constructor)? You could +define your own matcher function or callback and use it with `Truly()`, as the +previous couple of recipes have shown. Or, you may be able to get away from it +if you can guarantee that `bar` won't be changed after the `EXPECT_CALL()` is +executed. Just tell gMock that it should save a reference to `bar`, instead of a +copy of it. Here's how: + +```cpp +using ::testing::ByRef; +using ::testing::Eq; +using ::testing::Lt; +... + // Expects that Foo()'s argument == bar. + EXPECT_CALL(mock_obj, Foo(Eq(ByRef(bar)))); + + // Expects that Foo()'s argument < bar. + EXPECT_CALL(mock_obj, Foo(Lt(ByRef(bar)))); +``` + +Remember: if you do this, don't change `bar` after the `EXPECT_CALL()`, or the +result is undefined. + +### Validating a Member of an Object + +Often a mock function takes a reference to object as an argument. When matching +the argument, you may not want to compare the entire object against a fixed +object, as that may be over-specification. Instead, you may need to validate a +certain member variable or the result of a certain getter method of the object. +You can do this with `Field()` and `Property()`. More specifically, + +```cpp +Field(&Foo::bar, m) +``` + +is a matcher that matches a `Foo` object whose `bar` member variable satisfies +matcher `m`. + +```cpp +Property(&Foo::baz, m) +``` + +is a matcher that matches a `Foo` object whose `baz()` method returns a value +that satisfies matcher `m`. + +For example: + + +| Expression | Description | +| :--------------------------- | :--------------------------------------- | +| `Field(&Foo::number, Ge(3))` | Matches `x` where `x.number >= 3`. | +| `Property(&Foo::name, StartsWith("John "))` | Matches `x` where `x.name()` starts with `"John "`. | + + +Note that in `Property(&Foo::baz, ...)`, method `baz()` must take no argument +and be declared as `const`. + +BTW, `Field()` and `Property()` can also match plain pointers to objects. For +instance, + +```cpp +using ::testing::Field; +using ::testing::Ge; +... +Field(&Foo::number, Ge(3)) +``` + +matches a plain pointer `p` where `p->number >= 3`. 
If `p` is `NULL`, the match +will always fail regardless of the inner matcher. + +What if you want to validate more than one members at the same time? Remember +that there are [`AllOf()` and `AllOfArray()`](#CombiningMatchers). + +Finally `Field()` and `Property()` provide overloads that take the field or +property names as the first argument to include it in the error message. This +can be useful when creating combined matchers. + +```cpp +using ::testing::AllOf; +using ::testing::Field; +using ::testing::Matcher; +using ::testing::SafeMatcherCast; + +Matcher IsFoo(const Foo& foo) { + return AllOf(Field("some_field", &Foo::some_field, foo.some_field), + Field("other_field", &Foo::other_field, foo.other_field), + Field("last_field", &Foo::last_field, foo.last_field)); +} +``` + +### Validating the Value Pointed to by a Pointer Argument + +C++ functions often take pointers as arguments. You can use matchers like +`IsNull()`, `NotNull()`, and other comparison matchers to match a pointer, but +what if you want to make sure the value *pointed to* by the pointer, instead of +the pointer itself, has a certain property? Well, you can use the `Pointee(m)` +matcher. + +`Pointee(m)` matches a pointer if and only if `m` matches the value the pointer +points to. For example: + +```cpp +using ::testing::Ge; +using ::testing::Pointee; +... + EXPECT_CALL(foo, Bar(Pointee(Ge(3)))); +``` + +expects `foo.Bar()` to be called with a pointer that points to a value greater +than or equal to 3. + +One nice thing about `Pointee()` is that it treats a `NULL` pointer as a match +failure, so you can write `Pointee(m)` instead of + +```cpp +using ::testing::AllOf; +using ::testing::NotNull; +using ::testing::Pointee; +... + AllOf(NotNull(), Pointee(m)) +``` + +without worrying that a `NULL` pointer will crash your test. + +Also, did we tell you that `Pointee()` works with both raw pointers **and** +smart pointers (`std::unique_ptr`, `std::shared_ptr`, etc)? + +What if you have a pointer to pointer? You guessed it - you can use nested +`Pointee()` to probe deeper inside the value. For example, +`Pointee(Pointee(Lt(3)))` matches a pointer that points to a pointer that points +to a number less than 3 (what a mouthful...). + +### Testing a Certain Property of an Object + +Sometimes you want to specify that an object argument has a certain property, +but there is no existing matcher that does this. If you want good error +messages, you should [define a matcher](#NewMatchers). If you want to do it +quick and dirty, you could get away with writing an ordinary function. + +Let's say you have a mock function that takes an object of type `Foo`, which has +an `int bar()` method and an `int baz()` method, and you want to constrain that +the argument's `bar()` value plus its `baz()` value is a given number. 
Here's +how you can define a matcher to do it: + +```cpp +using ::testing::Matcher; +using ::testing::MatcherInterface; +using ::testing::MatchResultListener; + +class BarPlusBazEqMatcher : public MatcherInterface { + public: + explicit BarPlusBazEqMatcher(int expected_sum) + : expected_sum_(expected_sum) {} + + bool MatchAndExplain(const Foo& foo, + MatchResultListener* /* listener */) const override { + return (foo.bar() + foo.baz()) == expected_sum_; + } + + void DescribeTo(std::ostream* os) const override { + *os << "bar() + baz() equals " << expected_sum_; + } + + void DescribeNegationTo(std::ostream* os) const override { + *os << "bar() + baz() does not equal " << expected_sum_; + } + private: + const int expected_sum_; +}; + +Matcher BarPlusBazEq(int expected_sum) { + return MakeMatcher(new BarPlusBazEqMatcher(expected_sum)); +} + +... + EXPECT_CALL(..., DoThis(BarPlusBazEq(5)))...; +``` + +### Matching Containers + +Sometimes an STL container (e.g. list, vector, map, ...) is passed to a mock +function and you may want to validate it. Since most STL containers support the +`==` operator, you can write `Eq(expected_container)` or simply +`expected_container` to match a container exactly. + +Sometimes, though, you may want to be more flexible (for example, the first +element must be an exact match, but the second element can be any positive +number, and so on). Also, containers used in tests often have a small number of +elements, and having to define the expected container out-of-line is a bit of a +hassle. + +You can use the `ElementsAre()` or `UnorderedElementsAre()` matcher in such +cases: + +```cpp +using ::testing::_; +using ::testing::ElementsAre; +using ::testing::Gt; +... + MOCK_METHOD(void, Foo, (const vector& numbers), (override)); +... + EXPECT_CALL(mock, Foo(ElementsAre(1, Gt(0), _, 5))); +``` + +The above matcher says that the container must have 4 elements, which must be 1, +greater than 0, anything, and 5 respectively. + +If you instead write: + +```cpp +using ::testing::_; +using ::testing::Gt; +using ::testing::UnorderedElementsAre; +... + MOCK_METHOD(void, Foo, (const vector& numbers), (override)); +... + EXPECT_CALL(mock, Foo(UnorderedElementsAre(1, Gt(0), _, 5))); +``` + +It means that the container must have 4 elements, which (under some permutation) +must be 1, greater than 0, anything, and 5 respectively. + +As an alternative you can place the arguments in a C-style array and use +`ElementsAreArray()` or `UnorderedElementsAreArray()` instead: + +```cpp +using ::testing::ElementsAreArray; +... + // ElementsAreArray accepts an array of element values. + const int expected_vector1[] = {1, 5, 2, 4, ...}; + EXPECT_CALL(mock, Foo(ElementsAreArray(expected_vector1))); + + // Or, an array of element matchers. + Matcher expected_vector2[] = {1, Gt(2), _, 3, ...}; + EXPECT_CALL(mock, Foo(ElementsAreArray(expected_vector2))); +``` + +In case the array needs to be dynamically created (and therefore the array size +cannot be inferred by the compiler), you can give `ElementsAreArray()` an +additional argument to specify the array size: + +```cpp +using ::testing::ElementsAreArray; +... + int* const expected_vector3 = new int[count]; + ... fill expected_vector3 with values ... + EXPECT_CALL(mock, Foo(ElementsAreArray(expected_vector3, count))); +``` + +Use `Pair` when comparing maps or other associative containers. + +```cpp +using testing::ElementsAre; +using testing::Pair; +... 
+ std::map m = {{"a", 1}, {"b", 2}, {"c", 3}}; + EXPECT_THAT(m, ElementsAre(Pair("a", 1), Pair("b", 2), Pair("c", 3))); +``` + +**Tips:** + +* `ElementsAre*()` can be used to match *any* container that implements the + STL iterator pattern (i.e. it has a `const_iterator` type and supports + `begin()/end()`), not just the ones defined in STL. It will even work with + container types yet to be written - as long as they follows the above + pattern. +* You can use nested `ElementsAre*()` to match nested (multi-dimensional) + containers. +* If the container is passed by pointer instead of by reference, just write + `Pointee(ElementsAre*(...))`. +* The order of elements *matters* for `ElementsAre*()`. If you are using it + with containers whose element order are undefined (e.g. `hash_map`) you + should use `WhenSorted` around `ElementsAre`. + +### Sharing Matchers + +Under the hood, a gMock matcher object consists of a pointer to a ref-counted +implementation object. Copying matchers is allowed and very efficient, as only +the pointer is copied. When the last matcher that references the implementation +object dies, the implementation object will be deleted. + +Therefore, if you have some complex matcher that you want to use again and +again, there is no need to build it everytime. Just assign it to a matcher +variable and use that variable repeatedly! For example, + +```cpp +using ::testing::AllOf; +using ::testing::Gt; +using ::testing::Le; +using ::testing::Matcher; +... + Matcher in_range = AllOf(Gt(5), Le(10)); + ... use in_range as a matcher in multiple EXPECT_CALLs ... +``` + +### Matchers must have no side-effects {#PureMatchers} + +WARNING: gMock does not guarantee when or how many times a matcher will be +invoked. Therefore, all matchers must be *purely functional*: they cannot have +any side effects, and the match result must not depend on anything other than +the matcher's parameters and the value being matched. + +This requirement must be satisfied no matter how a matcher is defined (e.g., if +it is one of the standard matchers, or a custom matcher). In particular, a +matcher can never call a mock function, as that will affect the state of the +mock object and gMock. + +## Setting Expectations + +### Knowing When to Expect {#UseOnCall} + + + +**`ON_CALL`** is likely the *single most under-utilized construct* in gMock. + +There are basically two constructs for defining the behavior of a mock object: +`ON_CALL` and `EXPECT_CALL`. The difference? `ON_CALL` defines what happens when +a mock method is called, but doesn't imply any expectation on the method +being called. `EXPECT_CALL` not only defines the behavior, but also sets an +expectation that the method will be called with the given arguments, for the +given number of times (and *in the given order* when you specify the order +too). + +Since `EXPECT_CALL` does more, isn't it better than `ON_CALL`? Not really. Every +`EXPECT_CALL` adds a constraint on the behavior of the code under test. Having +more constraints than necessary is *baaad* - even worse than not having enough +constraints. + +This may be counter-intuitive. How could tests that verify more be worse than +tests that verify less? Isn't verification the whole point of tests? + +The answer lies in *what* a test should verify. **A good test verifies the +contract of the code.** If a test over-specifies, it doesn't leave enough +freedom to the implementation. As a result, changing the implementation without +breaking the contract (e.g. 
refactoring and optimization), which should be +perfectly fine to do, can break such tests. Then you have to spend time fixing +them, only to see them broken again the next time the implementation is changed. + +Keep in mind that one doesn't have to verify more than one property in one test. +In fact, **it's a good style to verify only one thing in one test.** If you do +that, a bug will likely break only one or two tests instead of dozens (which +case would you rather debug?). If you are also in the habit of giving tests +descriptive names that tell what they verify, you can often easily guess what's +wrong just from the test log itself. + +So use `ON_CALL` by default, and only use `EXPECT_CALL` when you actually intend +to verify that the call is made. For example, you may have a bunch of `ON_CALL`s +in your test fixture to set the common mock behavior shared by all tests in the +same group, and write (scarcely) different `EXPECT_CALL`s in different `TEST_F`s +to verify different aspects of the code's behavior. Compared with the style +where each `TEST` has many `EXPECT_CALL`s, this leads to tests that are more +resilient to implementational changes (and thus less likely to require +maintenance) and makes the intent of the tests more obvious (so they are easier +to maintain when you do need to maintain them). + +If you are bothered by the "Uninteresting mock function call" message printed +when a mock method without an `EXPECT_CALL` is called, you may use a `NiceMock` +instead to suppress all such messages for the mock object, or suppress the +message for specific methods by adding `EXPECT_CALL(...).Times(AnyNumber())`. DO +NOT suppress it by blindly adding an `EXPECT_CALL(...)`, or you'll have a test +that's a pain to maintain. + +### Ignoring Uninteresting Calls + +If you are not interested in how a mock method is called, just don't say +anything about it. In this case, if the method is ever called, gMock will +perform its default action to allow the test program to continue. If you are not +happy with the default action taken by gMock, you can override it using +`DefaultValue::Set()` (described [here](#DefaultValue)) or `ON_CALL()`. + +Please note that once you expressed interest in a particular mock method (via +`EXPECT_CALL()`), all invocations to it must match some expectation. If this +function is called but the arguments don't match any `EXPECT_CALL()` statement, +it will be an error. + +### Disallowing Unexpected Calls + +If a mock method shouldn't be called at all, explicitly say so: + +```cpp +using ::testing::_; +... + EXPECT_CALL(foo, Bar(_)) + .Times(0); +``` + +If some calls to the method are allowed, but the rest are not, just list all the +expected calls: + +```cpp +using ::testing::AnyNumber; +using ::testing::Gt; +... + EXPECT_CALL(foo, Bar(5)); + EXPECT_CALL(foo, Bar(Gt(10))) + .Times(AnyNumber()); +``` + +A call to `foo.Bar()` that doesn't match any of the `EXPECT_CALL()` statements +will be an error. + +### Understanding Uninteresting vs Unexpected Calls {#uninteresting-vs-unexpected} + +*Uninteresting* calls and *unexpected* calls are different concepts in gMock. +*Very* different. + +A call `x.Y(...)` is **uninteresting** if there's *not even a single* +`EXPECT_CALL(x, Y(...))` set. In other words, the test isn't interested in the +`x.Y()` method at all, as evident in that the test doesn't care to say anything +about it. + +A call `x.Y(...)` is **unexpected** if there are *some* `EXPECT_CALL(x, +Y(...))`s set, but none of them matches the call. 
Put another way, the test is +interested in the `x.Y()` method (therefore it explicitly sets some +`EXPECT_CALL` to verify how it's called); however, the verification fails as the +test doesn't expect this particular call to happen. + +**An unexpected call is always an error,** as the code under test doesn't behave +the way the test expects it to behave. + +**By default, an uninteresting call is not an error,** as it violates no +constraint specified by the test. (gMock's philosophy is that saying nothing +means there is no constraint.) However, it leads to a warning, as it *might* +indicate a problem (e.g. the test author might have forgotten to specify a +constraint). + +In gMock, `NiceMock` and `StrictMock` can be used to make a mock class "nice" or +"strict". How does this affect uninteresting calls and unexpected calls? + +A **nice mock** suppresses uninteresting call *warnings*. It is less chatty than +the default mock, but otherwise is the same. If a test fails with a default +mock, it will also fail using a nice mock instead. And vice versa. Don't expect +making a mock nice to change the test's result. + +A **strict mock** turns uninteresting call warnings into errors. So making a +mock strict may change the test's result. + +Let's look at an example: + +```cpp +TEST(...) { + NiceMock mock_registry; + EXPECT_CALL(mock_registry, GetDomainOwner("google.com")) + .WillRepeatedly(Return("Larry Page")); + + // Use mock_registry in code under test. + ... &mock_registry ... +} +``` + +The sole `EXPECT_CALL` here says that all calls to `GetDomainOwner()` must have +`"google.com"` as the argument. If `GetDomainOwner("yahoo.com")` is called, it +will be an unexpected call, and thus an error. *Having a nice mock doesn't +change the severity of an unexpected call.* + +So how do we tell gMock that `GetDomainOwner()` can be called with some other +arguments as well? The standard technique is to add a "catch all" `EXPECT_CALL`: + +```cpp + EXPECT_CALL(mock_registry, GetDomainOwner(_)) + .Times(AnyNumber()); // catches all other calls to this method. + EXPECT_CALL(mock_registry, GetDomainOwner("google.com")) + .WillRepeatedly(Return("Larry Page")); +``` + +Remember that `_` is the wildcard matcher that matches anything. With this, if +`GetDomainOwner("google.com")` is called, it will do what the second +`EXPECT_CALL` says; if it is called with a different argument, it will do what +the first `EXPECT_CALL` says. + +Note that the order of the two `EXPECT_CALL`s is important, as a newer +`EXPECT_CALL` takes precedence over an older one. + +For more on uninteresting calls, nice mocks, and strict mocks, read +["The Nice, the Strict, and the Naggy"](#NiceStrictNaggy). + +### Ignoring Uninteresting Arguments {#ParameterlessExpectations} + +If your test doesn't care about the parameters (it only cares about the number +or order of calls), you can often simply omit the parameter list: + +```cpp + // Expect foo.Bar( ... ) twice with any arguments. + EXPECT_CALL(foo, Bar).Times(2); + + // Delegate to the given method whenever the factory is invoked. + ON_CALL(foo_factory, MakeFoo) + .WillByDefault(&BuildFooForTest); +``` + +This functionality is only available when a method is not overloaded; to prevent +unexpected behavior it is a compilation error to try to set an expectation on a +method where the specific overload is ambiguous. You can work around this by +supplying a [simpler mock interface](#SimplerInterfaces) than the mocked class +provides. 
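For instance, the [simpler interface](#SimplerInterfaces) trick can be as small as a single redispatch method. A minimal sketch, assuming a hypothetical `Backend` interface whose `Send()` is overloaded:

```cpp
class MockBackend : public Backend {
 public:
  // Both overloads forward to one non-overloaded mock method, so
  // EXPECT_CALL(mock, DoSend) is unambiguous.
  void Send(const std::string& payload) override { DoSend(); }
  void Send(const std::string& payload, int priority) override { DoSend(); }

  MOCK_METHOD(void, DoSend, ());
};

...
  MockBackend backend;
  // Cares only about how many times Send() is called, in either form.
  EXPECT_CALL(backend, DoSend).Times(2);
```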
+ +This pattern is also useful when the arguments are interesting, but match logic +is substantially complex. You can leave the argument list unspecified and use +SaveArg actions to [save the values for later verification](#SaveArgVerify). If +you do that, you can easily differentiate calling the method the wrong number of +times from calling it with the wrong arguments. + +### Expecting Ordered Calls {#OrderedCalls} + +Although an `EXPECT_CALL()` statement defined earlier takes precedence when +gMock tries to match a function call with an expectation, by default calls don't +have to happen in the order `EXPECT_CALL()` statements are written. For example, +if the arguments match the matchers in the third `EXPECT_CALL()`, but not those +in the first two, then the third expectation will be used. + +If you would rather have all calls occur in the order of the expectations, put +the `EXPECT_CALL()` statements in a block where you define a variable of type +`InSequence`: + +```cpp +using ::testing::_; +using ::testing::InSequence; + + { + InSequence s; + + EXPECT_CALL(foo, DoThis(5)); + EXPECT_CALL(bar, DoThat(_)) + .Times(2); + EXPECT_CALL(foo, DoThis(6)); + } +``` + +In this example, we expect a call to `foo.DoThis(5)`, followed by two calls to +`bar.DoThat()` where the argument can be anything, which are in turn followed by +a call to `foo.DoThis(6)`. If a call occurred out-of-order, gMock will report an +error. + +### Expecting Partially Ordered Calls {#PartialOrder} + +Sometimes requiring everything to occur in a predetermined order can lead to +brittle tests. For example, we may care about `A` occurring before both `B` and +`C`, but aren't interested in the relative order of `B` and `C`. In this case, +the test should reflect our real intent, instead of being overly constraining. + +gMock allows you to impose an arbitrary DAG (directed acyclic graph) on the +calls. One way to express the DAG is to use the [After](#AfterClause) clause of +`EXPECT_CALL`. + +Another way is via the `InSequence()` clause (not the same as the `InSequence` +class), which we borrowed from jMock 2. It's less flexible than `After()`, but +more convenient when you have long chains of sequential calls, as it doesn't +require you to come up with different names for the expectations in the chains. +Here's how it works: + +If we view `EXPECT_CALL()` statements as nodes in a graph, and add an edge from +node A to node B wherever A must occur before B, we can get a DAG. We use the +term "sequence" to mean a directed path in this DAG. Now, if we decompose the +DAG into sequences, we just need to know which sequences each `EXPECT_CALL()` +belongs to in order to be able to reconstruct the original DAG. + +So, to specify the partial order on the expectations we need to do two things: +first to define some `Sequence` objects, and then for each `EXPECT_CALL()` say +which `Sequence` objects it is part of. + +Expectations in the same sequence must occur in the order they are written. For +example, + +```cpp +using ::testing::Sequence; +... + Sequence s1, s2; + + EXPECT_CALL(foo, A()) + .InSequence(s1, s2); + EXPECT_CALL(bar, B()) + .InSequence(s1); + EXPECT_CALL(bar, C()) + .InSequence(s2); + EXPECT_CALL(foo, D()) + .InSequence(s2); +``` + +specifies the following DAG (where `s1` is `A -> B`, and `s2` is `A -> C -> D`): + +```text + +---> B + | + A ---| + | + +---> C ---> D +``` + +This means that A must occur before B and C, and C must occur before D. There's +no restriction about the order other than these. 
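For comparison, the same DAG can be written with the `After()` clause instead of `Sequence` objects. A rough sketch, reusing the `foo` and `bar` mocks from the example above:

```cpp
using ::testing::Expectation;
...
  Expectation a = EXPECT_CALL(foo, A());
  Expectation c = EXPECT_CALL(bar, C()).After(a);  // A must occur before C.
  EXPECT_CALL(bar, B()).After(a);                  // A must occur before B.
  EXPECT_CALL(foo, D()).After(c);                  // C must occur before D.
```

With `After()`, every expectation that something must follow needs a name, which is why the `Sequence`-based form is usually more convenient for long chains of calls.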
+ +### Controlling When an Expectation Retires + +When a mock method is called, gMock only considers expectations that are still +active. An expectation is active when created, and becomes inactive (aka +*retires*) when a call that has to occur later has occurred. For example, in + +```cpp +using ::testing::_; +using ::testing::Sequence; +... + Sequence s1, s2; + + EXPECT_CALL(log, Log(WARNING, _, "File too large.")) // #1 + .Times(AnyNumber()) + .InSequence(s1, s2); + EXPECT_CALL(log, Log(WARNING, _, "Data set is empty.")) // #2 + .InSequence(s1); + EXPECT_CALL(log, Log(WARNING, _, "User not found.")) // #3 + .InSequence(s2); +``` + +as soon as either #2 or #3 is matched, #1 will retire. If a warning `"File too +large."` is logged after this, it will be an error. + +Note that an expectation doesn't retire automatically when it's saturated. For +example, + +```cpp +using ::testing::_; +... + EXPECT_CALL(log, Log(WARNING, _, _)); // #1 + EXPECT_CALL(log, Log(WARNING, _, "File too large.")); // #2 +``` + +says that there will be exactly one warning with the message `"File too +large."`. If the second warning contains this message too, #2 will match again +and result in an upper-bound-violated error. + +If this is not what you want, you can ask an expectation to retire as soon as it +becomes saturated: + +```cpp +using ::testing::_; +... + EXPECT_CALL(log, Log(WARNING, _, _)); // #1 + EXPECT_CALL(log, Log(WARNING, _, "File too large.")) // #2 + .RetiresOnSaturation(); +``` + +Here #2 can be used only once, so if you have two warnings with the message +`"File too large."`, the first will match #2 and the second will match #1 - +there will be no error. + +## Using Actions + +### Returning References from Mock Methods + +If a mock function's return type is a reference, you need to use `ReturnRef()` +instead of `Return()` to return a result: + +```cpp +using ::testing::ReturnRef; + +class MockFoo : public Foo { + public: + MOCK_METHOD(Bar&, GetBar, (), (override)); +}; +... + MockFoo foo; + Bar bar; + EXPECT_CALL(foo, GetBar()) + .WillOnce(ReturnRef(bar)); +... +``` + +### Returning Live Values from Mock Methods + +The `Return(x)` action saves a copy of `x` when the action is created, and +always returns the same value whenever it's executed. Sometimes you may want to +instead return the *live* value of `x` (i.e. its value at the time when the +action is *executed*.). Use either `ReturnRef()` or `ReturnPointee()` for this +purpose. + +If the mock function's return type is a reference, you can do it using +`ReturnRef(x)`, as shown in the previous recipe ("Returning References from Mock +Methods"). However, gMock doesn't let you use `ReturnRef()` in a mock function +whose return type is not a reference, as doing that usually indicates a user +error. So, what shall you do? + +Though you may be tempted, DO NOT use `ByRef()`: + +```cpp +using testing::ByRef; +using testing::Return; + +class MockFoo : public Foo { + public: + MOCK_METHOD(int, GetValue, (), (override)); +}; +... + int x = 0; + MockFoo foo; + EXPECT_CALL(foo, GetValue()) + .WillRepeatedly(Return(ByRef(x))); // Wrong! + x = 42; + EXPECT_EQ(42, foo.GetValue()); +``` + +Unfortunately, it doesn't work here. The above code will fail with error: + +```text +Value of: foo.GetValue() + Actual: 0 +Expected: 42 +``` + +The reason is that `Return(*value*)` converts `value` to the actual return type +of the mock function at the time when the action is *created*, not when it is +*executed*. 
(This behavior was chosen for the action to be safe when `value` is +a proxy object that references some temporary objects.) As a result, `ByRef(x)` +is converted to an `int` value (instead of a `const int&`) when the expectation +is set, and `Return(ByRef(x))` will always return 0. + +`ReturnPointee(pointer)` was provided to solve this problem specifically. It +returns the value pointed to by `pointer` at the time the action is *executed*: + +```cpp +using testing::ReturnPointee; +... + int x = 0; + MockFoo foo; + EXPECT_CALL(foo, GetValue()) + .WillRepeatedly(ReturnPointee(&x)); // Note the & here. + x = 42; + EXPECT_EQ(42, foo.GetValue()); // This will succeed now. +``` + +### Combining Actions + +Want to do more than one thing when a function is called? That's fine. `DoAll()` +allow you to do sequence of actions every time. Only the return value of the +last action in the sequence will be used. + +```cpp +using ::testing::_; +using ::testing::DoAll; + +class MockFoo : public Foo { + public: + MOCK_METHOD(bool, Bar, (int n), (override)); +}; +... + EXPECT_CALL(foo, Bar(_)) + .WillOnce(DoAll(action_1, + action_2, + ... + action_n)); +``` + +### Verifying Complex Arguments {#SaveArgVerify} + +If you want to verify that a method is called with a particular argument but the +match criteria is complex, it can be difficult to distinguish between +cardinality failures (calling the method the wrong number of times) and argument +match failures. Similarly, if you are matching multiple parameters, it may not +be easy to distinguishing which argument failed to match. For example: + +```cpp + // Not ideal: this could fail because of a problem with arg1 or arg2, or maybe + // just the method wasn't called. + EXPECT_CALL(foo, SendValues(_, ElementsAre(1, 4, 4, 7), EqualsProto( ... ))); +``` + +You can instead save the arguments and test them individually: + +```cpp + EXPECT_CALL(foo, SendValues) + .WillOnce(DoAll(SaveArg<1>(&actual_array), SaveArg<2>(&actual_proto))); + ... run the test + EXPECT_THAT(actual_array, ElementsAre(1, 4, 4, 7)); + EXPECT_THAT(actual_proto, EqualsProto( ... )); +``` + +### Mocking Side Effects {#MockingSideEffects} + +Sometimes a method exhibits its effect not via returning a value but via side +effects. For example, it may change some global state or modify an output +argument. To mock side effects, in general you can define your own action by +implementing `::testing::ActionInterface`. + +If all you need to do is to change an output argument, the built-in +`SetArgPointee()` action is convenient: + +```cpp +using ::testing::_; +using ::testing::SetArgPointee; + +class MockMutator : public Mutator { + public: + MOCK_METHOD(void, Mutate, (bool mutate, int* value), (override)); + ... +} +... + MockMutator mutator; + EXPECT_CALL(mutator, Mutate(true, _)) + .WillOnce(SetArgPointee<1>(5)); +``` + +In this example, when `mutator.Mutate()` is called, we will assign 5 to the +`int` variable pointed to by argument #1 (0-based). + +`SetArgPointee()` conveniently makes an internal copy of the value you pass to +it, removing the need to keep the value in scope and alive. The implication +however is that the value must have a copy constructor and assignment operator. + +If the mock method also needs to return a value as well, you can chain +`SetArgPointee()` with `Return()` using `DoAll()`, remembering to put the +`Return()` statement last: + +```cpp +using ::testing::_; +using ::testing::Return; +using ::testing::SetArgPointee; + +class MockMutator : public Mutator { + public: + ... 
+ MOCK_METHOD(bool, MutateInt, (int* value), (override)); +} +... + MockMutator mutator; + EXPECT_CALL(mutator, MutateInt(_)) + .WillOnce(DoAll(SetArgPointee<0>(5), + Return(true))); +``` + +Note, however, that if you use the `ReturnOKWith()` method, it will override the +values provided by `SetArgPointee()` in the response parameters of your function +call. + +If the output argument is an array, use the `SetArrayArgument(first, last)` +action instead. It copies the elements in source range `[first, last)` to the +array pointed to by the `N`-th (0-based) argument: + +```cpp +using ::testing::NotNull; +using ::testing::SetArrayArgument; + +class MockArrayMutator : public ArrayMutator { + public: + MOCK_METHOD(void, Mutate, (int* values, int num_values), (override)); + ... +} +... + MockArrayMutator mutator; + int values[5] = {1, 2, 3, 4, 5}; + EXPECT_CALL(mutator, Mutate(NotNull(), 5)) + .WillOnce(SetArrayArgument<0>(values, values + 5)); +``` + +This also works when the argument is an output iterator: + +```cpp +using ::testing::_; +using ::testing::SetArrayArgument; + +class MockRolodex : public Rolodex { + public: + MOCK_METHOD(void, GetNames, (std::back_insert_iterator>), + (override)); + ... +} +... + MockRolodex rolodex; + vector names; + names.push_back("George"); + names.push_back("John"); + names.push_back("Thomas"); + EXPECT_CALL(rolodex, GetNames(_)) + .WillOnce(SetArrayArgument<0>(names.begin(), names.end())); +``` + +### Changing a Mock Object's Behavior Based on the State + +If you expect a call to change the behavior of a mock object, you can use +`::testing::InSequence` to specify different behaviors before and after the +call: + +```cpp +using ::testing::InSequence; +using ::testing::Return; + +... + { + InSequence seq; + EXPECT_CALL(my_mock, IsDirty()) + .WillRepeatedly(Return(true)); + EXPECT_CALL(my_mock, Flush()); + EXPECT_CALL(my_mock, IsDirty()) + .WillRepeatedly(Return(false)); + } + my_mock.FlushIfDirty(); +``` + +This makes `my_mock.IsDirty()` return `true` before `my_mock.Flush()` is called +and return `false` afterwards. + +If the behavior change is more complex, you can store the effects in a variable +and make a mock method get its return value from that variable: + +```cpp +using ::testing::_; +using ::testing::SaveArg; +using ::testing::Return; + +ACTION_P(ReturnPointee, p) { return *p; } +... + int previous_value = 0; + EXPECT_CALL(my_mock, GetPrevValue) + .WillRepeatedly(ReturnPointee(&previous_value)); + EXPECT_CALL(my_mock, UpdateValue) + .WillRepeatedly(SaveArg<0>(&previous_value)); + my_mock.DoSomethingToUpdateValue(); +``` + +Here `my_mock.GetPrevValue()` will always return the argument of the last +`UpdateValue()` call. + +### Setting the Default Value for a Return Type {#DefaultValue} + +If a mock method's return type is a built-in C++ type or pointer, by default it +will return 0 when invoked. Also, in C++ 11 and above, a mock method whose +return type has a default constructor will return a default-constructed value by +default. You only need to specify an action if this default value doesn't work +for you. + +Sometimes, you may want to change this default value, or you may want to specify +a default value for types gMock doesn't know about. You can do this using the +`::testing::DefaultValue` class template: + +```cpp +using ::testing::DefaultValue; + +class MockFoo : public Foo { + public: + MOCK_METHOD(Bar, CalculateBar, (), (override)); +}; + + +... + Bar default_bar; + // Sets the default return value for type Bar. 
+ DefaultValue::Set(default_bar); + + MockFoo foo; + + // We don't need to specify an action here, as the default + // return value works for us. + EXPECT_CALL(foo, CalculateBar()); + + foo.CalculateBar(); // This should return default_bar. + + // Unsets the default return value. + DefaultValue::Clear(); +``` + +Please note that changing the default value for a type can make you tests hard +to understand. We recommend you to use this feature judiciously. For example, +you may want to make sure the `Set()` and `Clear()` calls are right next to the +code that uses your mock. + +### Setting the Default Actions for a Mock Method + +You've learned how to change the default value of a given type. However, this +may be too coarse for your purpose: perhaps you have two mock methods with the +same return type and you want them to have different behaviors. The `ON_CALL()` +macro allows you to customize your mock's behavior at the method level: + +```cpp +using ::testing::_; +using ::testing::AnyNumber; +using ::testing::Gt; +using ::testing::Return; +... + ON_CALL(foo, Sign(_)) + .WillByDefault(Return(-1)); + ON_CALL(foo, Sign(0)) + .WillByDefault(Return(0)); + ON_CALL(foo, Sign(Gt(0))) + .WillByDefault(Return(1)); + + EXPECT_CALL(foo, Sign(_)) + .Times(AnyNumber()); + + foo.Sign(5); // This should return 1. + foo.Sign(-9); // This should return -1. + foo.Sign(0); // This should return 0. +``` + +As you may have guessed, when there are more than one `ON_CALL()` statements, +the newer ones in the order take precedence over the older ones. In other words, +the **last** one that matches the function arguments will be used. This matching +order allows you to set up the common behavior in a mock object's constructor or +the test fixture's set-up phase and specialize the mock's behavior later. + +Note that both `ON_CALL` and `EXPECT_CALL` have the same "later statements take +precedence" rule, but they don't interact. That is, `EXPECT_CALL`s have their +own precedence order distinct from the `ON_CALL` precedence order. + +### Using Functions/Methods/Functors/Lambdas as Actions {#FunctionsAsActions} + +If the built-in actions don't suit you, you can use an existing callable +(function, `std::function`, method, functor, lambda) as an action. + + + +```cpp +using ::testing::_; using ::testing::Invoke; + +class MockFoo : public Foo { + public: + MOCK_METHOD(int, Sum, (int x, int y), (override)); + MOCK_METHOD(bool, ComplexJob, (int x), (override)); +}; + +int CalculateSum(int x, int y) { return x + y; } +int Sum3(int x, int y, int z) { return x + y + z; } + +class Helper { + public: + bool ComplexJob(int x); +}; + +... + MockFoo foo; + Helper helper; + EXPECT_CALL(foo, Sum(_, _)) + .WillOnce(&CalculateSum) + .WillRepeatedly(Invoke(NewPermanentCallback(Sum3, 1))); + EXPECT_CALL(foo, ComplexJob(_)) + .WillOnce(Invoke(&helper, &Helper::ComplexJob)) + .WillOnce([] { return true; }) + .WillRepeatedly([](int x) { return x > 0; }); + + foo.Sum(5, 6); // Invokes CalculateSum(5, 6). + foo.Sum(2, 3); // Invokes Sum3(1, 2, 3). + foo.ComplexJob(10); // Invokes helper.ComplexJob(10). + foo.ComplexJob(-1); // Invokes the inline lambda. +``` + +The only requirement is that the type of the function, etc must be *compatible* +with the signature of the mock function, meaning that the latter's arguments (if +it takes any) can be implicitly converted to the corresponding arguments of the +former, and the former's return type can be implicitly converted to that of the +latter. 
So, you can invoke something whose type is *not* exactly the same as the +mock function, as long as it's safe to do so - nice, huh? + +**`Note:`{.escaped}** + +* The action takes ownership of the callback and will delete it when the + action itself is destructed. +* If the type of a callback is derived from a base callback type `C`, you need + to implicitly cast it to `C` to resolve the overloading, e.g. + + ```cpp + using ::testing::Invoke; + ... + ResultCallback* is_ok = ...; + ... Invoke(is_ok) ...; // This works. + + BlockingClosure* done = new BlockingClosure; + ... Invoke(implicit_cast(done)) ...; // The cast is necessary. + ``` + +### Using Functions with Extra Info as Actions + +The function or functor you call using `Invoke()` must have the same number of +arguments as the mock function you use it for. Sometimes you may have a function +that takes more arguments, and you are willing to pass in the extra arguments +yourself to fill the gap. You can do this in gMock using callbacks with +pre-bound arguments. Here's an example: + +```cpp +using ::testing::Invoke; + +class MockFoo : public Foo { + public: + MOCK_METHOD(char, DoThis, (int n), (override)); +}; + +char SignOfSum(int x, int y) { + const int sum = x + y; + return (sum > 0) ? '+' : (sum < 0) ? '-' : '0'; +} + +TEST_F(FooTest, Test) { + MockFoo foo; + + EXPECT_CALL(foo, DoThis(2)) + .WillOnce(Invoke(NewPermanentCallback(SignOfSum, 5))); + EXPECT_EQ('+', foo.DoThis(2)); // Invokes SignOfSum(5, 2). +} +``` + +### Invoking a Function/Method/Functor/Lambda/Callback Without Arguments + +`Invoke()` passes the mock function's arguments to the function, etc being +invoked such that the callee has the full context of the call to work with. If +the invoked function is not interested in some or all of the arguments, it can +simply ignore them. + +Yet, a common pattern is that a test author wants to invoke a function without +the arguments of the mock function. She could do that using a wrapper function +that throws away the arguments before invoking an underlining nullary function. +Needless to say, this can be tedious and obscures the intent of the test. + +There are two solutions to this problem. First, you can pass any callable of +zero args as an action. Alternatively, use `InvokeWithoutArgs()`, which is like +`Invoke()` except that it doesn't pass the mock function's arguments to the +callee. Here's an example of each: + +```cpp +using ::testing::_; +using ::testing::InvokeWithoutArgs; + +class MockFoo : public Foo { + public: + MOCK_METHOD(bool, ComplexJob, (int n), (override)); +}; + +bool Job1() { ... } +bool Job2(int n, char c) { ... } + +... + MockFoo foo; + EXPECT_CALL(foo, ComplexJob(_)) + .WillOnce([] { Job1(); }); + .WillOnce(InvokeWithoutArgs(NewPermanentCallback(Job2, 5, 'a'))); + + foo.ComplexJob(10); // Invokes Job1(). + foo.ComplexJob(20); // Invokes Job2(5, 'a'). +``` + +**`Note:`{.escaped}** + +* The action takes ownership of the callback and will delete it when the + action itself is destructed. +* If the type of a callback is derived from a base callback type `C`, you need + to implicitly cast it to `C` to resolve the overloading, e.g. + + ```cpp + using ::testing::InvokeWithoutArgs; + ... + ResultCallback* is_ok = ...; + ... InvokeWithoutArgs(is_ok) ...; // This works. + + BlockingClosure* done = ...; + ... InvokeWithoutArgs(implicit_cast(done)) ...; + // The cast is necessary. 
+  ```
+
+### Invoking an Argument of the Mock Function
+
+Sometimes a mock function will receive a function pointer or a functor (in
+other words, a "callable") as an argument, e.g.
+
+```cpp
+class MockFoo : public Foo {
+ public:
+  MOCK_METHOD(bool, DoThis, (int n, (ResultCallback1<bool, int>* callback)),
+              (override));
+};
+```
+
+and you may want to invoke this callable argument:
+
+```cpp
+using ::testing::_;
+...
+  MockFoo foo;
+  EXPECT_CALL(foo, DoThis(_, _))
+      .WillOnce(...);
+      // Will execute callback->Run(5), where callback is the
+      // second argument DoThis() receives.
+```
+
+NOTE: The section below is legacy documentation from before C++ had lambdas:
+
+Arghh, you need to refer to a mock function argument but C++ has no lambda
+(yet), so you have to define your own action. :-( Or do you really?
+
+Well, gMock has an action to solve *exactly* this problem:
+
+```cpp
+InvokeArgument<N>(arg_1, arg_2, ..., arg_m)
+```
+
+will invoke the `N`-th (0-based) argument the mock function receives, with
+`arg_1`, `arg_2`, ..., and `arg_m`. No matter if the argument is a function
+pointer, a functor, or a callback. gMock handles them all.
+
+With that, you could write:
+
+```cpp
+using ::testing::_;
+using ::testing::InvokeArgument;
+...
+  EXPECT_CALL(foo, DoThis(_, _))
+      .WillOnce(InvokeArgument<1>(5));
+      // Will execute callback->Run(5), where callback is the
+      // second argument DoThis() receives.
+```
+
+What if the callable takes an argument by reference? No problem - just wrap it
+inside `ByRef()`:
+
+```cpp
+  ...
+  MOCK_METHOD(bool, Bar,
+              ((ResultCallback2<bool, int, const Helper&>* callback)),
+              (override));
+  ...
+  using ::testing::_;
+  using ::testing::ByRef;
+  using ::testing::InvokeArgument;
+  ...
+  MockFoo foo;
+  Helper helper;
+  ...
+  EXPECT_CALL(foo, Bar(_))
+      .WillOnce(InvokeArgument<0>(5, ByRef(helper)));
+      // ByRef(helper) guarantees that a reference to helper, not a copy of it,
+      // will be passed to the callback.
+```
+
+What if the callable takes an argument by reference and we do **not** wrap the
+argument in `ByRef()`? Then `InvokeArgument()` will *make a copy* of the
+argument, and pass a *reference to the copy*, instead of a reference to the
+original value, to the callable. This is especially handy when the argument is a
+temporary value:
+
+```cpp
+  ...
+  MOCK_METHOD(bool, DoThat, (bool (*f)(const double& x, const string& s)),
+              (override));
+  ...
+  using ::testing::_;
+  using ::testing::InvokeArgument;
+  ...
+  MockFoo foo;
+  ...
+  EXPECT_CALL(foo, DoThat(_))
+      .WillOnce(InvokeArgument<0>(5.0, string("Hi")));
+      // Will execute (*f)(5.0, string("Hi")), where f is the function pointer
+      // DoThat() receives.  Note that the values 5.0 and string("Hi") are
+      // temporary and dead once the EXPECT_CALL() statement finishes.  Yet
+      // it's fine to perform this action later, since a copy of the values
+      // is kept inside the InvokeArgument action.
+```
+
+### Ignoring an Action's Result
+
+Sometimes you have an action that returns *something*, but you need an action
+that returns `void` (perhaps you want to use it in a mock function that returns
+`void`, or perhaps it needs to be used in `DoAll()` and it's not the last in the
+list). `IgnoreResult()` lets you do that. For example:
+
+```cpp
+using ::testing::_;
+using ::testing::DoAll;
+using ::testing::IgnoreResult;
+using ::testing::Return;
+
+int Process(const MyData& data);
+string DoSomething();
+
+class MockFoo : public Foo {
+ public:
+  MOCK_METHOD(void, Abc, (const MyData& data), (override));
+  MOCK_METHOD(bool, Xyz, (), (override));
+};
+
+  ...
+  MockFoo foo;
+  EXPECT_CALL(foo, Abc(_))
+      // .WillOnce(Invoke(Process));
+      // The above line won't compile as Process() returns int but Abc() needs
+      // to return void.
+      .WillOnce(IgnoreResult(Process));
+  EXPECT_CALL(foo, Xyz())
+      .WillOnce(DoAll(IgnoreResult(DoSomething),
+                      // Ignores the string DoSomething() returns.
+                      Return(true)));
+```
+
+Note that you **cannot** use `IgnoreResult()` on an action that already returns
+`void`. Doing so will lead to ugly compiler errors.
+
+### Selecting an Action's Arguments {#SelectingArgs}
+
+Say you have a mock function `Foo()` that takes seven arguments, and you have a
+custom action that you want to invoke when `Foo()` is called. Trouble is, the
+custom action only wants three arguments:
+
+```cpp
+using ::testing::_;
+using ::testing::Invoke;
+...
+  MOCK_METHOD(bool, Foo,
+              (bool visible, const string& name, int x, int y,
+               (const map<pair<int, int>, double>& weight), double min_weight,
+               double max_weight));
+...
+bool IsVisibleInQuadrant1(bool visible, int x, int y) {
+  return visible && x >= 0 && y >= 0;
+}
+...
+  EXPECT_CALL(mock, Foo)
+      .WillOnce(Invoke(IsVisibleInQuadrant1));  // Uh, won't compile. :-(
+```
+
+To please the compiler God, you need to define an "adaptor" that has the same
+signature as `Foo()` and calls the custom action with the right arguments:
+
+```cpp
+using ::testing::_;
+using ::testing::Invoke;
+...
+bool MyIsVisibleInQuadrant1(bool visible, const string& name, int x, int y,
+                            const map<pair<int, int>, double>& weight,
+                            double min_weight, double max_weight) {
+  return IsVisibleInQuadrant1(visible, x, y);
+}
+...
+  EXPECT_CALL(mock, Foo)
+      .WillOnce(Invoke(MyIsVisibleInQuadrant1));  // Now it works.
+```
+
+But isn't this awkward?
+
+gMock provides a generic *action adaptor*, so you can spend your time minding
+more important business than writing your own adaptors. Here's the syntax:
+
+```cpp
+WithArgs<N1, N2, ..., Nk>(action)
+```
+
+creates an action that passes the arguments of the mock function at the given
+indices (0-based) to the inner `action` and performs it. Using `WithArgs`, our
+original example can be written as:
+
+```cpp
+using ::testing::_;
+using ::testing::Invoke;
+using ::testing::WithArgs;
+...
+  EXPECT_CALL(mock, Foo)
+      .WillOnce(WithArgs<0, 2, 3>(Invoke(IsVisibleInQuadrant1)));  // No need to define your own adaptor.
+```
+
+For better readability, gMock also gives you:
+
+* `WithoutArgs(action)` when the inner `action` takes *no* argument, and
+* `WithArg<N>(action)` (no `s` after `Arg`) when the inner `action` takes
+  *one* argument.
+
+As you may have realized, `InvokeWithoutArgs(...)` is just syntactic sugar for
+`WithoutArgs(Invoke(...))`.
+
+Here are more tips:
+
+* The inner action used in `WithArgs` and friends does not have to be
+  `Invoke()` -- it can be anything.
+* You can repeat an argument in the argument list if necessary, e.g.
+  `WithArgs<2, 3, 3, 5>(...)`.
+* You can change the order of the arguments, e.g. `WithArgs<3, 2, 1>(...)`.
+* The types of the selected arguments do *not* have to match the signature of
+  the inner action exactly. It works as long as they can be implicitly
+  converted to the corresponding arguments of the inner action. For example,
+  if the 4-th argument of the mock function is an `int` and `my_action` takes
+  a `double`, `WithArg<4>(my_action)` will work.
+
+### Ignoring Arguments in Action Functions
+
+The [selecting-an-action's-arguments](#SelectingArgs) recipe showed us one way
+to make a mock function and an action with incompatible argument lists fit
+together. The downside is that wrapping the action in `WithArgs<...>()` can get
+tedious for people writing the tests.
+
+If you are defining a function (or method, functor, lambda, callback) to be used
+with `Invoke*()`, and you are not interested in some of its arguments, an
+alternative to `WithArgs` is to declare the uninteresting arguments as `Unused`.
+This makes the definition less cluttered and less fragile in case the types of
+the uninteresting arguments change. It could also increase the chance the action
+function can be reused. For example, given
+
+```cpp
+ public:
+  MOCK_METHOD(double, Foo, (const string& label, double x, double y),
+              (override));
+  MOCK_METHOD(double, Bar, (int index, double x, double y), (override));
+```
+
+instead of
+
+```cpp
+using ::testing::_;
+using ::testing::Invoke;
+
+double DistanceToOriginWithLabel(const string& label, double x, double y) {
+  return sqrt(x*x + y*y);
+}
+double DistanceToOriginWithIndex(int index, double x, double y) {
+  return sqrt(x*x + y*y);
+}
+...
+  EXPECT_CALL(mock, Foo("abc", _, _))
+      .WillOnce(Invoke(DistanceToOriginWithLabel));
+  EXPECT_CALL(mock, Bar(5, _, _))
+      .WillOnce(Invoke(DistanceToOriginWithIndex));
+```
+
+you could write
+
+```cpp
+using ::testing::_;
+using ::testing::Invoke;
+using ::testing::Unused;
+
+double DistanceToOrigin(Unused, double x, double y) {
+  return sqrt(x*x + y*y);
+}
+...
+  EXPECT_CALL(mock, Foo("abc", _, _))
+      .WillOnce(Invoke(DistanceToOrigin));
+  EXPECT_CALL(mock, Bar(5, _, _))
+      .WillOnce(Invoke(DistanceToOrigin));
+```
+
+### Sharing Actions
+
+Just like matchers, a gMock action object consists of a pointer to a ref-counted
+implementation object. Therefore copying actions is also allowed and very
+efficient. When the last action that references the implementation object dies,
+the implementation object will be deleted.
+
+If you have some complex action that you want to use again and again, you may
+not have to build it from scratch every time. If the action doesn't have an
+internal state (i.e. if it always does the same thing no matter how many times
+it has been called), you can assign it to an action variable and use that
+variable repeatedly. For example:
+
+```cpp
+using ::testing::Action;
+using ::testing::DoAll;
+using ::testing::Return;
+using ::testing::SetArgPointee;
+...
+  Action<bool(int*)> set_flag = DoAll(SetArgPointee<0>(5),
+                                      Return(true));
+  ... use set_flag in .WillOnce() and .WillRepeatedly() ...
+```
+
+However, if the action has its own state, you may be surprised if you share the
+action object. Suppose you have an action factory `IncrementCounter(init)` which
+creates an action that increments and returns a counter whose initial value is
+`init`. Using two actions created from the same expression is not the same as
+using one shared action; the two will exhibit different behaviors. Example:
+
+```cpp
+  EXPECT_CALL(foo, DoThis())
+      .WillRepeatedly(IncrementCounter(0));
+  EXPECT_CALL(foo, DoThat())
+      .WillRepeatedly(IncrementCounter(0));
+  foo.DoThis();  // Returns 1.
+  foo.DoThis();  // Returns 2.
+  foo.DoThat();  // Returns 1 - DoThat() uses a different
+                 // counter than DoThis()'s.
+```
+
+versus
+
+```cpp
+using ::testing::Action;
+...
+  Action<int()> increment = IncrementCounter(0);
+  EXPECT_CALL(foo, DoThis())
+      .WillRepeatedly(increment);
+  EXPECT_CALL(foo, DoThat())
+      .WillRepeatedly(increment);
+  foo.DoThis();  // Returns 1.
+  foo.DoThis();  // Returns 2.
+  foo.DoThat();  // Returns 3 - the counter is shared.
+``` + +### Testing Asynchronous Behavior + +One oft-encountered problem with gMock is that it can be hard to test +asynchronous behavior. Suppose you had a `EventQueue` class that you wanted to +test, and you created a separate `EventDispatcher` interface so that you could +easily mock it out. However, the implementation of the class fired all the +events on a background thread, which made test timings difficult. You could just +insert `sleep()` statements and hope for the best, but that makes your test +behavior nondeterministic. A better way is to use gMock actions and +`Notification` objects to force your asynchronous test to behave synchronously. + +```cpp +using ::testing::DoAll; +using ::testing::InvokeWithoutArgs; +using ::testing::Return; + +class MockEventDispatcher : public EventDispatcher { + MOCK_METHOD(bool, DispatchEvent, (int32), (override)); +}; + +ACTION_P(Notify, notification) { + notification->Notify(); +} + +TEST(EventQueueTest, EnqueueEventTest) { + MockEventDispatcher mock_event_dispatcher; + EventQueue event_queue(&mock_event_dispatcher); + + const int32 kEventId = 321; + Notification done; + EXPECT_CALL(mock_event_dispatcher, DispatchEvent(kEventId)) + .WillOnce(Notify(&done)); + + event_queue.EnqueueEvent(kEventId); + done.WaitForNotification(); +} +``` + +In the example above, we set our normal gMock expectations, but then add an +additional action to notify the `Notification` object. Now we can just call +`Notification::WaitForNotification()` in the main thread to wait for the +asynchronous call to finish. After that, our test suite is complete and we can +safely exit. + +Note: this example has a downside: namely, if the expectation is not satisfied, +our test will run forever. It will eventually time-out and fail, but it will +take longer and be slightly harder to debug. To alleviate this problem, you can +use `WaitForNotificationWithTimeout(ms)` instead of `WaitForNotification()`. + +## Misc Recipes on Using gMock + +### Mocking Methods That Use Move-Only Types + +C++11 introduced *move-only types*. A move-only-typed value can be moved from +one object to another, but cannot be copied. `std::unique_ptr` is probably +the most commonly used move-only type. + +Mocking a method that takes and/or returns move-only types presents some +challenges, but nothing insurmountable. This recipe shows you how you can do it. +Note that the support for move-only method arguments was only introduced to +gMock in April 2017; in older code, you may find more complex +[workarounds](#LegacyMoveOnly) for lack of this feature. + +Let’s say we are working on a fictional project that lets one post and share +snippets called “buzzes”. Your code uses these types: + +```cpp +enum class AccessLevel { kInternal, kPublic }; + +class Buzz { + public: + explicit Buzz(AccessLevel access) { ... } + ... +}; + +class Buzzer { + public: + virtual ~Buzzer() {} + virtual std::unique_ptr MakeBuzz(StringPiece text) = 0; + virtual bool ShareBuzz(std::unique_ptr buzz, int64_t timestamp) = 0; + ... +}; +``` + +A `Buzz` object represents a snippet being posted. A class that implements the +`Buzzer` interface is capable of creating and sharing `Buzz`es. Methods in +`Buzzer` may return a `unique_ptr` or take a `unique_ptr`. Now we +need to mock `Buzzer` in our tests. 
+ +To mock a method that accepts or returns move-only types, you just use the +familiar `MOCK_METHOD` syntax as usual: + +```cpp +class MockBuzzer : public Buzzer { + public: + MOCK_METHOD(std::unique_ptr, MakeBuzz, (StringPiece text), (override)); + MOCK_METHOD(bool, ShareBuzz, (std::unique_ptr buzz, int64_t timestamp), + (override)); +}; +``` + +Now that we have the mock class defined, we can use it in tests. In the +following code examples, we assume that we have defined a `MockBuzzer` object +named `mock_buzzer_`: + +```cpp + MockBuzzer mock_buzzer_; +``` + +First let’s see how we can set expectations on the `MakeBuzz()` method, which +returns a `unique_ptr`. + +As usual, if you set an expectation without an action (i.e. the `.WillOnce()` or +`.WillRepeatedly()` clause), when that expectation fires, the default action for +that method will be taken. Since `unique_ptr<>` has a default constructor that +returns a null `unique_ptr`, that’s what you’ll get if you don’t specify an +action: + +```cpp + // Use the default action. + EXPECT_CALL(mock_buzzer_, MakeBuzz("hello")); + + // Triggers the previous EXPECT_CALL. + EXPECT_EQ(nullptr, mock_buzzer_.MakeBuzz("hello")); +``` + +If you are not happy with the default action, you can tweak it as usual; see +[Setting Default Actions](#OnCall). + +If you just need to return a pre-defined move-only value, you can use the +`Return(ByMove(...))` action: + +```cpp + // When this fires, the unique_ptr<> specified by ByMove(...) will + // be returned. + EXPECT_CALL(mock_buzzer_, MakeBuzz("world")) + .WillOnce(Return(ByMove(MakeUnique(AccessLevel::kInternal)))); + + EXPECT_NE(nullptr, mock_buzzer_.MakeBuzz("world")); +``` + +Note that `ByMove()` is essential here - if you drop it, the code won’t compile. + +Quiz time! What do you think will happen if a `Return(ByMove(...))` action is +performed more than once (e.g. you write `... +.WillRepeatedly(Return(ByMove(...)));`)? Come think of it, after the first time +the action runs, the source value will be consumed (since it’s a move-only +value), so the next time around, there’s no value to move from -- you’ll get a +run-time error that `Return(ByMove(...))` can only be run once. + +If you need your mock method to do more than just moving a pre-defined value, +remember that you can always use a lambda or a callable object, which can do +pretty much anything you want: + +```cpp + EXPECT_CALL(mock_buzzer_, MakeBuzz("x")) + .WillRepeatedly([](StringPiece text) { + return MakeUnique(AccessLevel::kInternal); + }); + + EXPECT_NE(nullptr, mock_buzzer_.MakeBuzz("x")); + EXPECT_NE(nullptr, mock_buzzer_.MakeBuzz("x")); +``` + +Every time this `EXPECT_CALL` fires, a new `unique_ptr` will be created +and returned. You cannot do this with `Return(ByMove(...))`. + +That covers returning move-only values; but how do we work with methods +accepting move-only arguments? The answer is that they work normally, although +some actions will not compile when any of method's arguments are move-only. You +can always use `Return`, or a [lambda or functor](#FunctionsAsActions): + +```cpp + using ::testing::Unused; + + EXPECT_CALL(mock_buzzer_, ShareBuzz(NotNull(), _)).WillOnce(Return(true)); + EXPECT_TRUE(mock_buzzer_.ShareBuzz(MakeUnique(AccessLevel::kInternal)), + 0); + + EXPECT_CALL(mock_buzzer_, ShareBuzz(_, _)).WillOnce( + [](std::unique_ptr buzz, Unused) { return buzz != nullptr; }); + EXPECT_FALSE(mock_buzzer_.ShareBuzz(nullptr, 0)); +``` + +Many built-in actions (`WithArgs`, `WithoutArgs`,`DeleteArg`, `SaveArg`, ...) 
+could in principle support move-only arguments, but the support for this is not +implemented yet. If this is blocking you, please file a bug. + +A few actions (e.g. `DoAll`) copy their arguments internally, so they can never +work with non-copyable objects; you'll have to use functors instead. + +#### Legacy workarounds for move-only types {#LegacyMoveOnly} + +Support for move-only function arguments was only introduced to gMock in April +2017. In older code, you may encounter the following workaround for the lack of +this feature (it is no longer necessary - we're including it just for +reference): + +```cpp +class MockBuzzer : public Buzzer { + public: + MOCK_METHOD(bool, DoShareBuzz, (Buzz* buzz, Time timestamp)); + bool ShareBuzz(std::unique_ptr buzz, Time timestamp) override { + return DoShareBuzz(buzz.get(), timestamp); + } +}; +``` + +The trick is to delegate the `ShareBuzz()` method to a mock method (let’s call +it `DoShareBuzz()`) that does not take move-only parameters. Then, instead of +setting expectations on `ShareBuzz()`, you set them on the `DoShareBuzz()` mock +method: + +```cpp + MockBuzzer mock_buzzer_; + EXPECT_CALL(mock_buzzer_, DoShareBuzz(NotNull(), _)); + + // When one calls ShareBuzz() on the MockBuzzer like this, the call is + // forwarded to DoShareBuzz(), which is mocked. Therefore this statement + // will trigger the above EXPECT_CALL. + mock_buzzer_.ShareBuzz(MakeUnique(AccessLevel::kInternal), 0); +``` + +### Making the Compilation Faster + +Believe it or not, the *vast majority* of the time spent on compiling a mock +class is in generating its constructor and destructor, as they perform +non-trivial tasks (e.g. verification of the expectations). What's more, mock +methods with different signatures have different types and thus their +constructors/destructors need to be generated by the compiler separately. As a +result, if you mock many different types of methods, compiling your mock class +can get really slow. + +If you are experiencing slow compilation, you can move the definition of your +mock class' constructor and destructor out of the class body and into a `.cc` +file. This way, even if you `#include` your mock class in N files, the compiler +only needs to generate its constructor and destructor once, resulting in a much +faster compilation. + +Let's illustrate the idea using an example. Here's the definition of a mock +class before applying this recipe: + +```cpp +// File mock_foo.h. +... +class MockFoo : public Foo { + public: + // Since we don't declare the constructor or the destructor, + // the compiler will generate them in every translation unit + // where this mock class is used. + + MOCK_METHOD(int, DoThis, (), (override)); + MOCK_METHOD(bool, DoThat, (const char* str), (override)); + ... more mock methods ... +}; +``` + +After the change, it would look like: + +```cpp +// File mock_foo.h. +... +class MockFoo : public Foo { + public: + // The constructor and destructor are declared, but not defined, here. + MockFoo(); + virtual ~MockFoo(); + + MOCK_METHOD(int, DoThis, (), (override)); + MOCK_METHOD(bool, DoThat, (const char* str), (override)); + ... more mock methods ... +}; +``` + +and + +```cpp +// File mock_foo.cc. +#include "path/to/mock_foo.h" + +// The definitions may appear trivial, but the functions actually do a +// lot of things through the constructors/destructors of the member +// variables used to implement the mock methods. 
+MockFoo::MockFoo() {} +MockFoo::~MockFoo() {} +``` + +### Forcing a Verification + +When it's being destroyed, your friendly mock object will automatically verify +that all expectations on it have been satisfied, and will generate googletest +failures if not. This is convenient as it leaves you with one less thing to +worry about. That is, unless you are not sure if your mock object will be +destroyed. + +How could it be that your mock object won't eventually be destroyed? Well, it +might be created on the heap and owned by the code you are testing. Suppose +there's a bug in that code and it doesn't delete the mock object properly - you +could end up with a passing test when there's actually a bug. + +Using a heap checker is a good idea and can alleviate the concern, but its +implementation is not 100% reliable. So, sometimes you do want to *force* gMock +to verify a mock object before it is (hopefully) destructed. You can do this +with `Mock::VerifyAndClearExpectations(&mock_object)`: + +```cpp +TEST(MyServerTest, ProcessesRequest) { + using ::testing::Mock; + + MockFoo* const foo = new MockFoo; + EXPECT_CALL(*foo, ...)...; + // ... other expectations ... + + // server now owns foo. + MyServer server(foo); + server.ProcessRequest(...); + + // In case that server's destructor will forget to delete foo, + // this will verify the expectations anyway. + Mock::VerifyAndClearExpectations(foo); +} // server is destroyed when it goes out of scope here. +``` + +**Tip:** The `Mock::VerifyAndClearExpectations()` function returns a `bool` to +indicate whether the verification was successful (`true` for yes), so you can +wrap that function call inside a `ASSERT_TRUE()` if there is no point going +further when the verification has failed. + +### Using Check Points {#UsingCheckPoints} + +Sometimes you may want to "reset" a mock object at various check points in your +test: at each check point, you verify that all existing expectations on the mock +object have been satisfied, and then you set some new expectations on it as if +it's newly created. This allows you to work with a mock object in "phases" whose +sizes are each manageable. + +One such scenario is that in your test's `SetUp()` function, you may want to put +the object you are testing into a certain state, with the help from a mock +object. Once in the desired state, you want to clear all expectations on the +mock, such that in the `TEST_F` body you can set fresh expectations on it. + +As you may have figured out, the `Mock::VerifyAndClearExpectations()` function +we saw in the previous recipe can help you here. Or, if you are using +`ON_CALL()` to set default actions on the mock object and want to clear the +default actions as well, use `Mock::VerifyAndClear(&mock_object)` instead. This +function does what `Mock::VerifyAndClearExpectations(&mock_object)` does and +returns the same `bool`, **plus** it clears the `ON_CALL()` statements on +`mock_object` too. + +Another trick you can use to achieve the same effect is to put the expectations +in sequences and insert calls to a dummy "check-point" function at specific +places. Then you can verify that the mock function calls do happen at the right +time. For example, if you are exercising code: + +```cpp + Foo(1); + Foo(2); + Foo(3); +``` + +and want to verify that `Foo(1)` and `Foo(3)` both invoke `mock.Bar("a")`, but +`Foo(2)` doesn't invoke anything. 
You can write: + +```cpp +using ::testing::MockFunction; + +TEST(FooTest, InvokesBarCorrectly) { + MyMock mock; + // Class MockFunction has exactly one mock method. It is named + // Call() and has type F. + MockFunction check; + { + InSequence s; + + EXPECT_CALL(mock, Bar("a")); + EXPECT_CALL(check, Call("1")); + EXPECT_CALL(check, Call("2")); + EXPECT_CALL(mock, Bar("a")); + } + Foo(1); + check.Call("1"); + Foo(2); + check.Call("2"); + Foo(3); +} +``` + +The expectation spec says that the first `Bar("a")` must happen before check +point "1", the second `Bar("a")` must happen after check point "2", and nothing +should happen between the two check points. The explicit check points make it +easy to tell which `Bar("a")` is called by which call to `Foo()`. + +### Mocking Destructors + +Sometimes you want to make sure a mock object is destructed at the right time, +e.g. after `bar->A()` is called but before `bar->B()` is called. We already know +that you can specify constraints on the [order](#OrderedCalls) of mock function +calls, so all we need to do is to mock the destructor of the mock function. + +This sounds simple, except for one problem: a destructor is a special function +with special syntax and special semantics, and the `MOCK_METHOD` macro doesn't +work for it: + +```cpp +MOCK_METHOD(void, ~MockFoo, ()); // Won't compile! +``` + +The good news is that you can use a simple pattern to achieve the same effect. +First, add a mock function `Die()` to your mock class and call it in the +destructor, like this: + +```cpp +class MockFoo : public Foo { + ... + // Add the following two lines to the mock class. + MOCK_METHOD(void, Die, ()); + virtual ~MockFoo() { Die(); } +}; +``` + +(If the name `Die()` clashes with an existing symbol, choose another name.) Now, +we have translated the problem of testing when a `MockFoo` object dies to +testing when its `Die()` method is called: + +```cpp + MockFoo* foo = new MockFoo; + MockBar* bar = new MockBar; + ... + { + InSequence s; + + // Expects *foo to die after bar->A() and before bar->B(). + EXPECT_CALL(*bar, A()); + EXPECT_CALL(*foo, Die()); + EXPECT_CALL(*bar, B()); + } +``` + +And that's that. + +### Using gMock and Threads {#UsingThreads} + +In a **unit** test, it's best if you could isolate and test a piece of code in a +single-threaded context. That avoids race conditions and dead locks, and makes +debugging your test much easier. + +Yet most programs are multi-threaded, and sometimes to test something we need to +pound on it from more than one thread. gMock works for this purpose too. + +Remember the steps for using a mock: + +1. Create a mock object `foo`. +2. Set its default actions and expectations using `ON_CALL()` and + `EXPECT_CALL()`. +3. The code under test calls methods of `foo`. +4. Optionally, verify and reset the mock. +5. Destroy the mock yourself, or let the code under test destroy it. The + destructor will automatically verify it. + +If you follow the following simple rules, your mocks and threads can live +happily together: + +* Execute your *test code* (as opposed to the code being tested) in *one* + thread. This makes your test easy to follow. +* Obviously, you can do step #1 without locking. +* When doing step #2 and #5, make sure no other thread is accessing `foo`. + Obvious too, huh? +* #3 and #4 can be done either in one thread or in multiple threads - anyway + you want. gMock takes care of the locking, so you don't have to do any - + unless required by your test logic. 
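+
+To make the rules concrete, here is a minimal sketch. Everything in it
+(`Logger`, `MockLogger`, the test name) is invented for this illustration and
+is not part of the recipes above; the point is only where each step runs: the
+expectations are set in the test thread before the worker starts, the worker
+thread merely calls the mock, and verification happens after the worker has
+been joined.
+
+```cpp
+#include <string>
+#include <thread>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+// Hypothetical interface, used only for this sketch.
+class Logger {
+ public:
+  virtual ~Logger() = default;
+  virtual void Log(const std::string& line) = 0;
+};
+
+class MockLogger : public Logger {
+ public:
+  MOCK_METHOD(void, Log, (const std::string& line), (override));
+};
+
+TEST(WorkerTest, LogsFromBackgroundThread) {
+  using ::testing::AtLeast;
+
+  MockLogger logger;
+
+  // Step #2 happens in the test thread, before any other thread can
+  // touch the mock.
+  EXPECT_CALL(logger, Log("done")).Times(AtLeast(1));
+
+  // Step #3: the code under test may call the mock from another thread;
+  // gMock does its own locking around the call itself.
+  std::thread worker([&logger] { logger.Log("done"); });
+  worker.join();
+
+  // Steps #4/#5: the worker has finished, so the automatic verification in
+  // the mock's destructor runs while no other thread is using the mock.
+}
+```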
+ +If you violate the rules (for example, if you set expectations on a mock while +another thread is calling its methods), you get undefined behavior. That's not +fun, so don't do it. + +gMock guarantees that the action for a mock function is done in the same thread +that called the mock function. For example, in + +```cpp + EXPECT_CALL(mock, Foo(1)) + .WillOnce(action1); + EXPECT_CALL(mock, Foo(2)) + .WillOnce(action2); +``` + +if `Foo(1)` is called in thread 1 and `Foo(2)` is called in thread 2, gMock will +execute `action1` in thread 1 and `action2` in thread 2. + +gMock does *not* impose a sequence on actions performed in different threads +(doing so may create deadlocks as the actions may need to cooperate). This means +that the execution of `action1` and `action2` in the above example *may* +interleave. If this is a problem, you should add proper synchronization logic to +`action1` and `action2` to make the test thread-safe. + +Also, remember that `DefaultValue` is a global resource that potentially +affects *all* living mock objects in your program. Naturally, you won't want to +mess with it from multiple threads or when there still are mocks in action. + +### Controlling How Much Information gMock Prints + +When gMock sees something that has the potential of being an error (e.g. a mock +function with no expectation is called, a.k.a. an uninteresting call, which is +allowed but perhaps you forgot to explicitly ban the call), it prints some +warning messages, including the arguments of the function, the return value, and +the stack trace. Hopefully this will remind you to take a look and see if there +is indeed a problem. + +Sometimes you are confident that your tests are correct and may not appreciate +such friendly messages. Some other times, you are debugging your tests or +learning about the behavior of the code you are testing, and wish you could +observe every mock call that happens (including argument values, the return +value, and the stack trace). Clearly, one size doesn't fit all. + +You can control how much gMock tells you using the `--gmock_verbose=LEVEL` +command-line flag, where `LEVEL` is a string with three possible values: + +* `info`: gMock will print all informational messages, warnings, and errors + (most verbose). At this setting, gMock will also log any calls to the + `ON_CALL/EXPECT_CALL` macros. It will include a stack trace in + "uninteresting call" warnings. +* `warning`: gMock will print both warnings and errors (less verbose); it will + omit the stack traces in "uninteresting call" warnings. This is the default. +* `error`: gMock will print errors only (least verbose). + +Alternatively, you can adjust the value of that flag from within your tests like +so: + +```cpp + ::testing::FLAGS_gmock_verbose = "error"; +``` + +If you find gMock printing too many stack frames with its informational or +warning messages, remember that you can control their amount with the +`--gtest_stack_trace_depth=max_depth` flag. + +Now, judiciously use the right flag to enable gMock serve you better! + +### Gaining Super Vision into Mock Calls + +You have a test using gMock. It fails: gMock tells you some expectations aren't +satisfied. However, you aren't sure why: Is there a typo somewhere in the +matchers? Did you mess up the order of the `EXPECT_CALL`s? Or is the code under +test doing something wrong? How can you find out the cause? + +Won't it be nice if you have X-ray vision and can actually see the trace of all +`EXPECT_CALL`s and mock method calls as they are made? 
For each call, would you +like to see its actual argument values and which `EXPECT_CALL` gMock thinks it +matches? If you still need some help to figure out who made these calls, how +about being able to see the complete stack trace at each mock call? + +You can unlock this power by running your test with the `--gmock_verbose=info` +flag. For example, given the test program: + +```cpp +#include "gmock/gmock.h" + +using testing::_; +using testing::HasSubstr; +using testing::Return; + +class MockFoo { + public: + MOCK_METHOD(void, F, (const string& x, const string& y)); +}; + +TEST(Foo, Bar) { + MockFoo mock; + EXPECT_CALL(mock, F(_, _)).WillRepeatedly(Return()); + EXPECT_CALL(mock, F("a", "b")); + EXPECT_CALL(mock, F("c", HasSubstr("d"))); + + mock.F("a", "good"); + mock.F("a", "b"); +} +``` + +if you run it with `--gmock_verbose=info`, you will see this output: + +```shell +[ RUN ] Foo.Bar + +foo_test.cc:14: EXPECT_CALL(mock, F(_, _)) invoked +Stack trace: ... + +foo_test.cc:15: EXPECT_CALL(mock, F("a", "b")) invoked +Stack trace: ... + +foo_test.cc:16: EXPECT_CALL(mock, F("c", HasSubstr("d"))) invoked +Stack trace: ... + +foo_test.cc:14: Mock function call matches EXPECT_CALL(mock, F(_, _))... + Function call: F(@0x7fff7c8dad40"a",@0x7fff7c8dad10"good") +Stack trace: ... + +foo_test.cc:15: Mock function call matches EXPECT_CALL(mock, F("a", "b"))... + Function call: F(@0x7fff7c8dada0"a",@0x7fff7c8dad70"b") +Stack trace: ... + +foo_test.cc:16: Failure +Actual function call count doesn't match EXPECT_CALL(mock, F("c", HasSubstr("d")))... + Expected: to be called once + Actual: never called - unsatisfied and active +[ FAILED ] Foo.Bar +``` + +Suppose the bug is that the `"c"` in the third `EXPECT_CALL` is a typo and +should actually be `"a"`. With the above message, you should see that the actual +`F("a", "good")` call is matched by the first `EXPECT_CALL`, not the third as +you thought. From that it should be obvious that the third `EXPECT_CALL` is +written wrong. Case solved. + +If you are interested in the mock call trace but not the stack traces, you can +combine `--gmock_verbose=info` with `--gtest_stack_trace_depth=0` on the test +command line. + + + +### Running Tests in Emacs + +If you build and run your tests in Emacs using the `M-x google-compile` command +(as many googletest users do), the source file locations of gMock and googletest +errors will be highlighted. Just press `` on one of them and you'll be +taken to the offending line. Or, you can just type `C-x`` to jump to the next +error. + +To make it even easier, you can add the following lines to your `~/.emacs` file: + +```text +(global-set-key "\M-m" 'google-compile) ; m is for make +(global-set-key [M-down] 'next-error) +(global-set-key [M-up] '(lambda () (interactive) (next-error -1))) +``` + +Then you can type `M-m` to start a build (if you want to run the test as well, +just make sure `foo_test.run` or `runtests` is in the build command you supply +after typing `M-m`), or `M-up`/`M-down` to move back and forth between errors. + +## Extending gMock + +### Writing New Matchers Quickly {#NewMatchers} + +WARNING: gMock does not guarantee when or how many times a matcher will be +invoked. Therefore, all matchers must be functionally pure. See +[this section](#PureMatchers) for more details. + +The `MATCHER*` family of macros can be used to define custom matchers easily. 
+The syntax: + +```cpp +MATCHER(name, description_string_expression) { statements; } +``` + +will define a matcher with the given name that executes the statements, which +must return a `bool` to indicate if the match succeeds. Inside the statements, +you can refer to the value being matched by `arg`, and refer to its type by +`arg_type`. + +The *description string* is a `string`-typed expression that documents what the +matcher does, and is used to generate the failure message when the match fails. +It can (and should) reference the special `bool` variable `negation`, and should +evaluate to the description of the matcher when `negation` is `false`, or that +of the matcher's negation when `negation` is `true`. + +For convenience, we allow the description string to be empty (`""`), in which +case gMock will use the sequence of words in the matcher name as the +description. + +For example: + +```cpp +MATCHER(IsDivisibleBy7, "") { return (arg % 7) == 0; } +``` + +allows you to write + +```cpp + // Expects mock_foo.Bar(n) to be called where n is divisible by 7. + EXPECT_CALL(mock_foo, Bar(IsDivisibleBy7())); +``` + +or, + +```cpp + using ::testing::Not; + ... + // Verifies that two values are divisible by 7. + EXPECT_THAT(some_expression, IsDivisibleBy7()); + EXPECT_THAT(some_other_expression, Not(IsDivisibleBy7())); +``` + +If the above assertions fail, they will print something like: + +```shell + Value of: some_expression + Expected: is divisible by 7 + Actual: 27 + ... + Value of: some_other_expression + Expected: not (is divisible by 7) + Actual: 21 +``` + +where the descriptions `"is divisible by 7"` and `"not (is divisible by 7)"` are +automatically calculated from the matcher name `IsDivisibleBy7`. + +As you may have noticed, the auto-generated descriptions (especially those for +the negation) may not be so great. You can always override them with a `string` +expression of your own: + +```cpp +MATCHER(IsDivisibleBy7, + absl::StrCat(negation ? "isn't" : "is", " divisible by 7")) { + return (arg % 7) == 0; +} +``` + +Optionally, you can stream additional information to a hidden argument named +`result_listener` to explain the match result. For example, a better definition +of `IsDivisibleBy7` is: + +```cpp +MATCHER(IsDivisibleBy7, "") { + if ((arg % 7) == 0) + return true; + + *result_listener << "the remainder is " << (arg % 7); + return false; +} +``` + +With this definition, the above assertion will give a better message: + +```shell + Value of: some_expression + Expected: is divisible by 7 + Actual: 27 (the remainder is 6) +``` + +You should let `MatchAndExplain()` print *any additional information* that can +help a user understand the match result. Note that it should explain why the +match succeeds in case of a success (unless it's obvious) - this is useful when +the matcher is used inside `Not()`. There is no need to print the argument value +itself, as gMock already prints it for you. + +NOTE: The type of the value being matched (`arg_type`) is determined by the +context in which you use the matcher and is supplied to you by the compiler, so +you don't need to worry about declaring it (nor can you). This allows the +matcher to be polymorphic. For example, `IsDivisibleBy7()` can be used to match +any type where the value of `(arg % 7) == 0` can be implicitly converted to a +`bool`. In the `Bar(IsDivisibleBy7())` example above, if method `Bar()` takes an +`int`, `arg_type` will be `int`; if it takes an `unsigned long`, `arg_type` will +be `unsigned long`; and so on. 
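+
+As a small illustration of that polymorphism, the sketch below reuses the
+`IsDivisibleBy7()` matcher from this recipe against a few different argument
+types (the test name and the `Each()` composition are just for this example):
+
+```cpp
+#include <vector>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+using ::testing::Each;
+
+MATCHER(IsDivisibleBy7, "") { return (arg % 7) == 0; }
+
+TEST(IsDivisibleBy7Test, WorksForSeveralArgumentTypes) {
+  EXPECT_THAT(14, IsDivisibleBy7());     // here arg_type is int
+  EXPECT_THAT(49UL, IsDivisibleBy7());   // here arg_type is unsigned long
+
+  // The matcher also composes with container matchers such as Each(),
+  // where it is applied to every element (arg_type is int again).
+  const std::vector<int> values = {7, 21, 42};
+  EXPECT_THAT(values, Each(IsDivisibleBy7()));
+}
+```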
+ +### Writing New Parameterized Matchers Quickly + +Sometimes you'll want to define a matcher that has parameters. For that you can +use the macro: + +```cpp +MATCHER_P(name, param_name, description_string) { statements; } +``` + +where the description string can be either `""` or a `string` expression that +references `negation` and `param_name`. + +For example: + +```cpp +MATCHER_P(HasAbsoluteValue, value, "") { return abs(arg) == value; } +``` + +will allow you to write: + +```cpp + EXPECT_THAT(Blah("a"), HasAbsoluteValue(n)); +``` + +which may lead to this message (assuming `n` is 10): + +```shell + Value of: Blah("a") + Expected: has absolute value 10 + Actual: -9 +``` + +Note that both the matcher description and its parameter are printed, making the +message human-friendly. + +In the matcher definition body, you can write `foo_type` to reference the type +of a parameter named `foo`. For example, in the body of +`MATCHER_P(HasAbsoluteValue, value)` above, you can write `value_type` to refer +to the type of `value`. + +gMock also provides `MATCHER_P2`, `MATCHER_P3`, ..., up to `MATCHER_P10` to +support multi-parameter matchers: + +```cpp +MATCHER_Pk(name, param_1, ..., param_k, description_string) { statements; } +``` + +Please note that the custom description string is for a particular *instance* of +the matcher, where the parameters have been bound to actual values. Therefore +usually you'll want the parameter values to be part of the description. gMock +lets you do that by referencing the matcher parameters in the description string +expression. + +For example, + +```cpp +using ::testing::PrintToString; +MATCHER_P2(InClosedRange, low, hi, + absl::StrFormat("%s in range [%s, %s]", negation ? "isn't" : "is", + PrintToString(low), PrintToString(hi))) { + return low <= arg && arg <= hi; +} +... +EXPECT_THAT(3, InClosedRange(4, 6)); +``` + +would generate a failure that contains the message: + +```shell + Expected: is in range [4, 6] +``` + +If you specify `""` as the description, the failure message will contain the +sequence of words in the matcher name followed by the parameter values printed +as a tuple. For example, + +```cpp + MATCHER_P2(InClosedRange, low, hi, "") { ... } + ... + EXPECT_THAT(3, InClosedRange(4, 6)); +``` + +would generate a failure that contains the text: + +```shell + Expected: in closed range (4, 6) +``` + +For the purpose of typing, you can view + +```cpp +MATCHER_Pk(Foo, p1, ..., pk, description_string) { ... } +``` + +as shorthand for + +```cpp +template +FooMatcherPk +Foo(p1_type p1, ..., pk_type pk) { ... } +``` + +When you write `Foo(v1, ..., vk)`, the compiler infers the types of the +parameters `v1`, ..., and `vk` for you. If you are not happy with the result of +the type inference, you can specify the types by explicitly instantiating the +template, as in `Foo(5, false)`. As said earlier, you don't get to +(or need to) specify `arg_type` as that's determined by the context in which the +matcher is used. + +You can assign the result of expression `Foo(p1, ..., pk)` to a variable of type +`FooMatcherPk`. This can be useful when composing +matchers. Matchers that don't have a parameter or have only one parameter have +special types: you can assign `Foo()` to a `FooMatcher`-typed variable, and +assign `Foo(p)` to a `FooMatcherP`-typed variable. + +While you can instantiate a matcher template with reference types, passing the +parameters by pointer usually makes your code more readable. 
If, however, you
+still want to pass a parameter by reference, be aware that in the failure
+message generated by the matcher you will see the value of the referenced object
+but not its address.
+
+You can overload matchers with different numbers of parameters:
+
+```cpp
+MATCHER_P(Blah, a, description_string_1) { ... }
+MATCHER_P2(Blah, a, b, description_string_2) { ... }
+```
+
+While it's tempting to always use the `MATCHER*` macros when defining a new
+matcher, you should also consider implementing `MatcherInterface` or using
+`MakePolymorphicMatcher()` instead (see the recipes that follow), especially if
+you need to use the matcher a lot. While these approaches require more work,
+they give you more control on the types of the value being matched and the
+matcher parameters, which in general leads to better compiler error messages
+that pay off in the long run. They also allow overloading matchers based on
+parameter types (as opposed to just based on the number of parameters).
+
+### Writing New Monomorphic Matchers
+
+A matcher of argument type `T` implements `::testing::MatcherInterface<T>` and
+does two things: it tests whether a value of type `T` matches the matcher, and
+can describe what kind of values it matches. The latter ability is used for
+generating readable error messages when expectations are violated.
+
+The interface looks like this:
+
+```cpp
+class MatchResultListener {
+ public:
+  ...
+  // Streams x to the underlying ostream; does nothing if the ostream
+  // is NULL.
+  template <typename T>
+  MatchResultListener& operator<<(const T& x);
+
+  // Returns the underlying ostream.
+  std::ostream* stream();
+};
+
+template <typename T>
+class MatcherInterface {
+ public:
+  virtual ~MatcherInterface();
+
+  // Returns true if and only if the matcher matches x; also explains the match
+  // result to 'listener'.
+  virtual bool MatchAndExplain(T x, MatchResultListener* listener) const = 0;
+
+  // Describes this matcher to an ostream.
+  virtual void DescribeTo(std::ostream* os) const = 0;
+
+  // Describes the negation of this matcher to an ostream.
+  virtual void DescribeNegationTo(std::ostream* os) const;
+};
+```
+
+If you need a custom matcher but `Truly()` is not a good option (for example,
+you may not be happy with the way `Truly(predicate)` describes itself, or you
+may want your matcher to be polymorphic as `Eq(value)` is), you can define a
+matcher to do whatever you want in two steps: first implement the matcher
+interface, and then define a factory function to create a matcher instance. The
+second step is not strictly needed but it makes the syntax of using the matcher
+nicer.
+
+For example, you can define a matcher to test whether an `int` is divisible by 7
+and then use it like this:
+
+```cpp
+using ::testing::MakeMatcher;
+using ::testing::Matcher;
+using ::testing::MatcherInterface;
+using ::testing::MatchResultListener;
+
+class DivisibleBy7Matcher : public MatcherInterface<int> {
+ public:
+  bool MatchAndExplain(int n,
+                       MatchResultListener* /* listener */) const override {
+    return (n % 7) == 0;
+  }
+
+  void DescribeTo(std::ostream* os) const override {
+    *os << "is divisible by 7";
+  }
+
+  void DescribeNegationTo(std::ostream* os) const override {
+    *os << "is not divisible by 7";
+  }
+};
+
+Matcher<int> DivisibleBy7() {
+  return MakeMatcher(new DivisibleBy7Matcher);
+}
+
+...
+ EXPECT_CALL(foo, Bar(DivisibleBy7())); +``` + +You may improve the matcher message by streaming additional information to the +`listener` argument in `MatchAndExplain()`: + +```cpp +class DivisibleBy7Matcher : public MatcherInterface { + public: + bool MatchAndExplain(int n, + MatchResultListener* listener) const override { + const int remainder = n % 7; + if (remainder != 0) { + *listener << "the remainder is " << remainder; + } + return remainder == 0; + } + ... +}; +``` + +Then, `EXPECT_THAT(x, DivisibleBy7());` may generate a message like this: + +```shell +Value of: x +Expected: is divisible by 7 + Actual: 23 (the remainder is 2) +``` + +### Writing New Polymorphic Matchers + +You've learned how to write your own matchers in the previous recipe. Just one +problem: a matcher created using `MakeMatcher()` only works for one particular +type of arguments. If you want a *polymorphic* matcher that works with arguments +of several types (for instance, `Eq(x)` can be used to match a *`value`* as long +as `value == x` compiles -- *`value`* and `x` don't have to share the same +type), you can learn the trick from `testing/base/public/gmock-matchers.h` but +it's a bit involved. + +Fortunately, most of the time you can define a polymorphic matcher easily with +the help of `MakePolymorphicMatcher()`. Here's how you can define `NotNull()` as +an example: + +```cpp +using ::testing::MakePolymorphicMatcher; +using ::testing::MatchResultListener; +using ::testing::PolymorphicMatcher; + +class NotNullMatcher { + public: + // To implement a polymorphic matcher, first define a COPYABLE class + // that has three members MatchAndExplain(), DescribeTo(), and + // DescribeNegationTo(), like the following. + + // In this example, we want to use NotNull() with any pointer, so + // MatchAndExplain() accepts a pointer of any type as its first argument. + // In general, you can define MatchAndExplain() as an ordinary method or + // a method template, or even overload it. + template + bool MatchAndExplain(T* p, + MatchResultListener* /* listener */) const { + return p != NULL; + } + + // Describes the property of a value matching this matcher. + void DescribeTo(std::ostream* os) const { *os << "is not NULL"; } + + // Describes the property of a value NOT matching this matcher. + void DescribeNegationTo(std::ostream* os) const { *os << "is NULL"; } +}; + +// To construct a polymorphic matcher, pass an instance of the class +// to MakePolymorphicMatcher(). Note the return type. +PolymorphicMatcher NotNull() { + return MakePolymorphicMatcher(NotNullMatcher()); +} + +... + + EXPECT_CALL(foo, Bar(NotNull())); // The argument must be a non-NULL pointer. +``` + +**Note:** Your polymorphic matcher class does **not** need to inherit from +`MatcherInterface` or any other class, and its methods do **not** need to be +virtual. + +Like in a monomorphic matcher, you may explain the match result by streaming +additional information to the `listener` argument in `MatchAndExplain()`. + +### Writing New Cardinalities + +A cardinality is used in `Times()` to tell gMock how many times you expect a +call to occur. It doesn't have to be exact. For example, you can say +`AtLeast(5)` or `Between(2, 4)`. 
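+
+For example, here is a minimal sketch of the built-in cardinalities in use
+(the `Counter`/`MockCounter` names are placeholders invented for this
+illustration):
+
+```cpp
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+using ::testing::AtLeast;
+using ::testing::Between;
+
+class Counter {
+ public:
+  virtual ~Counter() = default;
+  virtual void Increment() = 0;
+  virtual void Reset() = 0;
+};
+
+class MockCounter : public Counter {
+ public:
+  MOCK_METHOD(void, Increment, (), (override));
+  MOCK_METHOD(void, Reset, (), (override));
+};
+
+TEST(CardinalityDemo, BuiltInCardinalities) {
+  MockCounter counter;
+  EXPECT_CALL(counter, Increment()).Times(AtLeast(5));  // five or more calls
+  EXPECT_CALL(counter, Reset()).Times(Between(2, 4));   // two to four calls
+
+  for (int i = 0; i < 6; ++i) counter.Increment();
+  counter.Reset();
+  counter.Reset();
+}
+```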
+ +If the [built-in set](cheat_sheet.md#CardinalityList) of cardinalities doesn't +suit you, you are free to define your own by implementing the following +interface (in namespace `testing`): + +```cpp +class CardinalityInterface { + public: + virtual ~CardinalityInterface(); + + // Returns true if and only if call_count calls will satisfy this cardinality. + virtual bool IsSatisfiedByCallCount(int call_count) const = 0; + + // Returns true if and only if call_count calls will saturate this + // cardinality. + virtual bool IsSaturatedByCallCount(int call_count) const = 0; + + // Describes self to an ostream. + virtual void DescribeTo(std::ostream* os) const = 0; +}; +``` + +For example, to specify that a call must occur even number of times, you can +write + +```cpp +using ::testing::Cardinality; +using ::testing::CardinalityInterface; +using ::testing::MakeCardinality; + +class EvenNumberCardinality : public CardinalityInterface { + public: + bool IsSatisfiedByCallCount(int call_count) const override { + return (call_count % 2) == 0; + } + + bool IsSaturatedByCallCount(int call_count) const override { + return false; + } + + void DescribeTo(std::ostream* os) const { + *os << "called even number of times"; + } +}; + +Cardinality EvenNumber() { + return MakeCardinality(new EvenNumberCardinality); +} + +... + EXPECT_CALL(foo, Bar(3)) + .Times(EvenNumber()); +``` + +### Writing New Actions Quickly {#QuickNewActions} + +If the built-in actions don't work for you, you can easily define your own one. +Just define a functor class with a (possibly templated) call operator, matching +the signature of your action. + +```cpp +struct Increment { + template + T operator()(T* arg) { + return ++(*arg); + } +} +``` + +The same approach works with stateful functors (or any callable, really): + +``` +struct MultiplyBy { + template + T operator()(T arg) { return arg * multiplier; } + + int multiplier; +} + +// Then use: +// EXPECT_CALL(...).WillOnce(MultiplyBy{7}); +``` + +#### Legacy macro-based Actions + +Before C++11, the functor-based actions were not supported; the old way of +writing actions was through a set of `ACTION*` macros. We suggest to avoid them +in new code; they hide a lot of logic behind the macro, potentially leading to +harder-to-understand compiler errors. Nevertheless, we cover them here for +completeness. + +By writing + +```cpp +ACTION(name) { statements; } +``` + +in a namespace scope (i.e. not inside a class or function), you will define an +action with the given name that executes the statements. The value returned by +`statements` will be used as the return value of the action. Inside the +statements, you can refer to the K-th (0-based) argument of the mock function as +`argK`. For example: + +```cpp +ACTION(IncrementArg1) { return ++(*arg1); } +``` + +allows you to write + +```cpp +... WillOnce(IncrementArg1()); +``` + +Note that you don't need to specify the types of the mock function arguments. +Rest assured that your code is type-safe though: you'll get a compiler error if +`*arg1` doesn't support the `++` operator, or if the type of `++(*arg1)` isn't +compatible with the mock function's return type. + +Another example: + +```cpp +ACTION(Foo) { + (*arg2)(5); + Blah(); + *arg1 = 0; + return arg0; +} +``` + +defines an action `Foo()` that invokes argument #2 (a function pointer) with 5, +calls function `Blah()`, sets the value pointed to by argument #1 to 0, and +returns argument #0. 
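+
+To make the argument indices concrete, here is a sketch of a mock method whose
+signature lines up with that action. The `MockProcessor`/`Start` names, the
+`Blah()` stub, and the test itself are invented for this illustration; only
+the `Foo` action comes from the text above (it is repeated so the sketch is
+self-contained):
+
+```cpp
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+void Blah() {}  // stands in for the Blah() called by the action
+
+ACTION(Foo) {
+  (*arg2)(5);
+  Blah();
+  *arg1 = 0;
+  return arg0;
+}
+
+class MockProcessor {
+ public:
+  // arg0 is an int, arg1 an int*, arg2 a function pointer - exactly the
+  // shape the Foo action expects.
+  MOCK_METHOD(int, Start, (int id, int* status, void (*callback)(int)));
+};
+
+TEST(ActionFooDemo, ArgumentIndices) {
+  MockProcessor processor;
+  int status = -1;
+
+  EXPECT_CALL(processor, Start).WillOnce(Foo());
+
+  // Foo() invokes callback(5), calls Blah(), zeroes *status, and returns id.
+  const int result = processor.Start(7, &status, [](int) {});
+  EXPECT_EQ(0, status);
+  EXPECT_EQ(7, result);
+}
+```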
+ +For more convenience and flexibility, you can also use the following pre-defined +symbols in the body of `ACTION`: + +`argK_type` | The type of the K-th (0-based) argument of the mock function +:-------------- | :----------------------------------------------------------- +`args` | All arguments of the mock function as a tuple +`args_type` | The type of all arguments of the mock function as a tuple +`return_type` | The return type of the mock function +`function_type` | The type of the mock function + +For example, when using an `ACTION` as a stub action for mock function: + +```cpp +int DoSomething(bool flag, int* ptr); +``` + +we have: + +Pre-defined Symbol | Is Bound To +------------------ | --------------------------------- +`arg0` | the value of `flag` +`arg0_type` | the type `bool` +`arg1` | the value of `ptr` +`arg1_type` | the type `int*` +`args` | the tuple `(flag, ptr)` +`args_type` | the type `std::tuple` +`return_type` | the type `int` +`function_type` | the type `int(bool, int*)` + +#### Legacy macro-based parameterized Actions + +Sometimes you'll want to parameterize an action you define. For that we have +another macro + +```cpp +ACTION_P(name, param) { statements; } +``` + +For example, + +```cpp +ACTION_P(Add, n) { return arg0 + n; } +``` + +will allow you to write + +```cpp +// Returns argument #0 + 5. +... WillOnce(Add(5)); +``` + +For convenience, we use the term *arguments* for the values used to invoke the +mock function, and the term *parameters* for the values used to instantiate an +action. + +Note that you don't need to provide the type of the parameter either. Suppose +the parameter is named `param`, you can also use the gMock-defined symbol +`param_type` to refer to the type of the parameter as inferred by the compiler. +For example, in the body of `ACTION_P(Add, n)` above, you can write `n_type` for +the type of `n`. + +gMock also provides `ACTION_P2`, `ACTION_P3`, and etc to support multi-parameter +actions. For example, + +```cpp +ACTION_P2(ReturnDistanceTo, x, y) { + double dx = arg0 - x; + double dy = arg1 - y; + return sqrt(dx*dx + dy*dy); +} +``` + +lets you write + +```cpp +... WillOnce(ReturnDistanceTo(5.0, 26.5)); +``` + +You can view `ACTION` as a degenerated parameterized action where the number of +parameters is 0. + +You can also easily define actions overloaded on the number of parameters: + +```cpp +ACTION_P(Plus, a) { ... } +ACTION_P2(Plus, a, b) { ... } +``` + +### Restricting the Type of an Argument or Parameter in an ACTION + +For maximum brevity and reusability, the `ACTION*` macros don't ask you to +provide the types of the mock function arguments and the action parameters. +Instead, we let the compiler infer the types for us. + +Sometimes, however, we may want to be more explicit about the types. There are +several tricks to do that. For example: + +```cpp +ACTION(Foo) { + // Makes sure arg0 can be converted to int. + int n = arg0; + ... use n instead of arg0 here ... +} + +ACTION_P(Bar, param) { + // Makes sure the type of arg1 is const char*. + ::testing::StaticAssertTypeEq(); + + // Makes sure param can be converted to bool. + bool flag = param; +} +``` + +where `StaticAssertTypeEq` is a compile-time assertion in googletest that +verifies two types are the same. + +### Writing New Action Templates Quickly + +Sometimes you want to give an action explicit template parameters that cannot be +inferred from its value parameters. `ACTION_TEMPLATE()` supports that and can be +viewed as an extension to `ACTION()` and `ACTION_P*()`. 
+
+The syntax:
+
+```cpp
+ACTION_TEMPLATE(ActionName,
+                HAS_m_TEMPLATE_PARAMS(kind1, name1, ..., kind_m, name_m),
+                AND_n_VALUE_PARAMS(p1, ..., p_n)) { statements; }
+```
+
+defines an action template that takes *m* explicit template parameters and *n*
+value parameters, where *m* is in [1, 10] and *n* is in [0, 10]. `name_i` is the
+name of the *i*-th template parameter, and `kind_i` specifies whether it's a
+`typename`, an integral constant, or a template. `p_i` is the name of the *i*-th
+value parameter.
+
+Example:
+
+```cpp
+// DuplicateArg<k, T>(output) converts the k-th argument of the mock
+// function to type T and copies it to *output.
+ACTION_TEMPLATE(DuplicateArg,
+                // Note the comma between int and k:
+                HAS_2_TEMPLATE_PARAMS(int, k, typename, T),
+                AND_1_VALUE_PARAMS(output)) {
+  *output = T(std::get<k>(args));
+}
+```
+
+To create an instance of an action template, write:
+
+```cpp
+ActionName<t1, ..., t_m>(v1, ..., v_n)
+```
+
+where the `t`s are the template arguments and the `v`s are the value arguments.
+The value argument types are inferred by the compiler. For example:
+
+```cpp
+using ::testing::_;
+...
+  int n;
+  EXPECT_CALL(mock, Foo).WillOnce(DuplicateArg<1, unsigned char>(&n));
+```
+
+If you want to explicitly specify the value argument types, you can provide
+additional template arguments:
+
+```cpp
+ActionName<t1, ..., t_m, u1, ..., u_n>(v1, ..., v_n)
+```
+
+where `u_i` is the desired type of `v_i`.
+
+`ACTION_TEMPLATE` and `ACTION`/`ACTION_P*` can be overloaded on the number of
+value parameters, but not on the number of template parameters. Without the
+restriction, the meaning of the following is unclear:
+
+```cpp
+  OverloadedAction<bool>(x);
+```
+
+Are we using a single-template-parameter action where `bool` refers to the type
+of `x`, or a two-template-parameter action where the compiler is asked to infer
+the type of `x`?
+
+### Using the ACTION Object's Type
+
+If you are writing a function that returns an `ACTION` object, you'll need to
+know its type. The type depends on the macro used to define the action and the
+parameter types. The rule is relatively simple:
+
+| Given Definition                                                                | Expression                                 | Has Type                               |
+| ------------------------------------------------------------------------------- | ------------------------------------------ | -------------------------------------- |
+| `ACTION(Foo)`                                                                   | `Foo()`                                    | `FooAction`                            |
+| `ACTION_TEMPLATE(Foo, HAS_m_TEMPLATE_PARAMS(...), AND_0_VALUE_PARAMS())`        | `Foo<t1, ..., t_m>()`                      | `FooAction<t1, ..., t_m>`              |
+| `ACTION_P(Bar, param)`                                                          | `Bar(int_value)`                           | `BarActionP<int>`                      |
+| `ACTION_TEMPLATE(Bar, HAS_m_TEMPLATE_PARAMS(...), AND_1_VALUE_PARAMS(p1))`      | `Bar<t1, ..., t_m>(int_value)`             | `BarActionP<t1, ..., t_m, int>`        |
+| `ACTION_P2(Baz, p1, p2)`                                                        | `Baz(bool_value, int_value)`               | `BazActionP2<bool, int>`               |
+| `ACTION_TEMPLATE(Baz, HAS_m_TEMPLATE_PARAMS(...), AND_2_VALUE_PARAMS(p1, p2))`  | `Baz<t1, ..., t_m>(bool_value, int_value)` | `BazActionP2<t1, ..., t_m, bool, int>` |
+| ...                                                                             | ...                                        | ...                                    |
+
+Note that we have to pick different suffixes (`Action`, `ActionP`, `ActionP2`,
+etc.) for actions with different numbers of value parameters, or the action
+definitions cannot be overloaded on the number of them.
+
+### Writing New Monomorphic Actions {#NewMonoActions}
+
+While the `ACTION*` macros are very convenient, sometimes they are
+inappropriate. For example, despite the tricks shown in the previous recipes,
+they don't let you directly specify the types of the mock function arguments and
+the action parameters, which in general leads to unoptimized compiler error
+messages that can baffle unfamiliar users. They also don't allow overloading
+actions based on parameter types without jumping through some hoops.
+
+An alternative to the `ACTION*` macros is to implement
+`::testing::ActionInterface<F>`, where `F` is the type of the mock function in
+which the action will be used. For example:
+
+```cpp
+template <typename F>
+class ActionInterface {
+ public:
+  virtual ~ActionInterface();
+
+  // Performs the action. Result is the return type of function type
+  // F, and ArgumentTuple is the tuple of arguments of F.
+  //
+  // For example, if F is int(bool, const string&), then Result would
+  // be int, and ArgumentTuple would be std::tuple<bool, const string&>.
+  virtual Result Perform(const ArgumentTuple& args) = 0;
+};
+```
+
+```cpp
+using ::testing::_;
+using ::testing::Action;
+using ::testing::ActionInterface;
+using ::testing::MakeAction;
+
+typedef int IncrementMethod(int*);
+
+class IncrementArgumentAction : public ActionInterface<IncrementMethod> {
+ public:
+  int Perform(const std::tuple<int*>& args) override {
+    int* p = std::get<0>(args);  // Grabs the first argument.
+    return (*p)++;
+  }
+};
+
+Action<IncrementMethod> IncrementArgument() {
+  return MakeAction(new IncrementArgumentAction);
+}
+
+...
+  EXPECT_CALL(foo, Baz(_))
+      .WillOnce(IncrementArgument());
+
+  int n = 5;
+  foo.Baz(&n);  // Should return 5 and change n to 6.
+```
+
+### Writing New Polymorphic Actions {#NewPolyActions}
+
+The previous recipe showed you how to define your own action. This is all good,
+except that you need to know the type of the function in which the action will
+be used. Sometimes that can be a problem. For example, you may want to use the
+action in functions with *different* types (e.g. like `Return()` and
+`SetArgPointee()`).
+
+If an action can be used in several types of mock functions, we say it's
+*polymorphic*. The `MakePolymorphicAction()` function template makes it easy to
+define such an action:
+
+```cpp
+namespace testing {
+template <typename Impl>
+PolymorphicAction<Impl> MakePolymorphicAction(const Impl& impl);
+}  // namespace testing
+```
+
+As an example, let's define an action that returns the second argument in the
+mock function's argument list. The first step is to define an implementation
+class:
+
+```cpp
+class ReturnSecondArgumentAction {
+ public:
+  template <typename Result, typename ArgumentTuple>
+  Result Perform(const ArgumentTuple& args) const {
+    // To get the i-th (0-based) argument, use std::get<i>(args).
+    return std::get<1>(args);
+  }
+};
+```
+
+This implementation class does *not* need to inherit from any particular class.
+What matters is that it must have a `Perform()` method template. This method
+template takes the mock function's arguments as a tuple in a **single**
+argument, and returns the result of the action. It can be either `const` or not,
+but must be invokable with exactly one template argument, which is the result
+type. In other words, you must be able to call `Perform<R>(args)` where `R` is
+the mock function's return type and `args` is its arguments in a tuple.
+
+Next, we use `MakePolymorphicAction()` to turn an instance of the implementation
+class into the polymorphic action we need. It will be convenient to have a
+wrapper for this:
+
+```cpp
+using ::testing::MakePolymorphicAction;
+using ::testing::PolymorphicAction;
+
+PolymorphicAction<ReturnSecondArgumentAction> ReturnSecondArgument() {
+  return MakePolymorphicAction(ReturnSecondArgumentAction());
+}
+```
+
+Now, you can use this polymorphic action the same way you use the built-in ones:
+
+```cpp
+using ::testing::_;
+
+class MockFoo : public Foo {
+ public:
+  MOCK_METHOD(int, DoThis, (bool flag, int n), (override));
+  MOCK_METHOD(string, DoThat, (int x, const char* str1, const char* str2),
+              (override));
+};
+
+  ...
+ MockFoo foo; + EXPECT_CALL(foo, DoThis).WillOnce(ReturnSecondArgument()); + EXPECT_CALL(foo, DoThat).WillOnce(ReturnSecondArgument()); + ... + foo.DoThis(true, 5); // Will return 5. + foo.DoThat(1, "Hi", "Bye"); // Will return "Hi". +``` + +### Teaching gMock How to Print Your Values + +When an uninteresting or unexpected call occurs, gMock prints the argument +values and the stack trace to help you debug. Assertion macros like +`EXPECT_THAT` and `EXPECT_EQ` also print the values in question when the +assertion fails. gMock and googletest do this using googletest's user-extensible +value printer. + +This printer knows how to print built-in C++ types, native arrays, STL +containers, and any type that supports the `<<` operator. For other types, it +prints the raw bytes in the value and hopes that you the user can figure it out. +[googletest's advanced guide](../../googletest/docs/advanced.md#teaching-googletest-how-to-print-your-values) +explains how to extend the printer to do a better job at printing your +particular type than to dump the bytes. + +## Useful Mocks Created Using gMock + + + + +### Mock std::function {#MockFunction} + +`std::function` is a general function type introduced in C++11. It is a +preferred way of passing callbacks to new interfaces. Functions are copiable, +and are not usually passed around by pointer, which makes them tricky to mock. +But fear not - `MockFunction` can help you with that. + +`MockFunction` has a mock method `Call()` with the signature: + +```cpp + R Call(T1, ..., Tn); +``` + +It also has a `AsStdFunction()` method, which creates a `std::function` proxy +forwarding to Call: + +```cpp + std::function AsStdFunction(); +``` + +To use `MockFunction`, first create `MockFunction` object and set up +expectations on its `Call` method. Then pass proxy obtained from +`AsStdFunction()` to the code you are testing. For example: + +```cpp +TEST(FooTest, RunsCallbackWithBarArgument) { + // 1. Create a mock object. + MockFunction mock_function; + + // 2. Set expectations on Call() method. + EXPECT_CALL(mock_function, Call("bar")).WillOnce(Return(1)); + + // 3. Exercise code that uses std::function. + Foo(mock_function.AsStdFunction()); + // Foo's signature can be either of: + // void Foo(const std::function& fun); + // void Foo(std::function fun); + + // 4. All expectations will be verified when mock_function + // goes out of scope and is destroyed. +} +``` + +Remember that function objects created with `AsStdFunction()` are just +forwarders. If you create multiple of them, they will share the same set of +expectations. + +Although `std::function` supports unlimited number of arguments, `MockFunction` +implementation is limited to ten. If you ever hit that limit... well, your +callback has bigger problems than being mockable. :-) + + diff --git a/GraphBLAS/CUDA/test/googlemock/docs/for_dummies.md b/GraphBLAS/CUDA/test/googlemock/docs/for_dummies.md new file mode 100644 index 0000000000..327e6cc327 --- /dev/null +++ b/GraphBLAS/CUDA/test/googlemock/docs/for_dummies.md @@ -0,0 +1,700 @@ +## gMock for Dummies {#GMockForDummies} + + + +### What Is gMock? + +When you write a prototype or test, often it's not feasible or wise to rely on +real objects entirely. A **mock object** implements the same interface as a real +object (so it can be used as one), but lets you specify at run time how it will +be used and what it should do (which methods will be called? in which order? how +many times? with what arguments? what will they return? etc). 
+ +**Note:** It is easy to confuse the term *fake objects* with mock objects. Fakes +and mocks actually mean very different things in the Test-Driven Development +(TDD) community: + +* **Fake** objects have working implementations, but usually take some + shortcut (perhaps to make the operations less expensive), which makes them + not suitable for production. An in-memory file system would be an example of + a fake. +* **Mocks** are objects pre-programmed with *expectations*, which form a + specification of the calls they are expected to receive. + +If all this seems too abstract for you, don't worry - the most important thing +to remember is that a mock allows you to check the *interaction* between itself +and code that uses it. The difference between fakes and mocks shall become much +clearer once you start to use mocks. + +**gMock** is a library (sometimes we also call it a "framework" to make it sound +cool) for creating mock classes and using them. It does to C++ what +jMock/EasyMock does to Java (well, more or less). + +When using gMock, + +1. first, you use some simple macros to describe the interface you want to + mock, and they will expand to the implementation of your mock class; +2. next, you create some mock objects and specify its expectations and behavior + using an intuitive syntax; +3. then you exercise code that uses the mock objects. gMock will catch any + violation to the expectations as soon as it arises. + +### Why gMock? + +While mock objects help you remove unnecessary dependencies in tests and make +them fast and reliable, using mocks manually in C++ is *hard*: + +* Someone has to implement the mocks. The job is usually tedious and + error-prone. No wonder people go great distance to avoid it. +* The quality of those manually written mocks is a bit, uh, unpredictable. You + may see some really polished ones, but you may also see some that were + hacked up in a hurry and have all sorts of ad hoc restrictions. +* The knowledge you gained from using one mock doesn't transfer to the next + one. + +In contrast, Java and Python programmers have some fine mock frameworks (jMock, +EasyMock, [Mox](http://wtf/mox), etc), which automate the creation of mocks. As +a result, mocking is a proven effective technique and widely adopted practice in +those communities. Having the right tool absolutely makes the difference. + +gMock was built to help C++ programmers. It was inspired by jMock and EasyMock, +but designed with C++'s specifics in mind. It is your friend if any of the +following problems is bothering you: + +* You are stuck with a sub-optimal design and wish you had done more + prototyping before it was too late, but prototyping in C++ is by no means + "rapid". +* Your tests are slow as they depend on too many libraries or use expensive + resources (e.g. a database). +* Your tests are brittle as some resources they use are unreliable (e.g. the + network). +* You want to test how your code handles a failure (e.g. a file checksum + error), but it's not easy to cause one. +* You need to make sure that your module interacts with other modules in the + right way, but it's hard to observe the interaction; therefore you resort to + observing the side effects at the end of the action, but it's awkward at + best. +* You want to "mock out" your dependencies, except that they don't have mock + implementations yet; and, frankly, you aren't thrilled by some of those + hand-written mocks. 
+ +We encourage you to use gMock as + +* a *design* tool, for it lets you experiment with your interface design early + and often. More iterations lead to better designs! +* a *testing* tool to cut your tests' outbound dependencies and probe the + interaction between your module and its collaborators. + +### Getting Started + +gMock is bundled with googletest. + +### A Case for Mock Turtles + +Let's look at an example. Suppose you are developing a graphics program that +relies on a [LOGO](http://en.wikipedia.org/wiki/Logo_programming_language)-like +API for drawing. How would you test that it does the right thing? Well, you can +run it and compare the screen with a golden screen snapshot, but let's admit it: +tests like this are expensive to run and fragile (What if you just upgraded to a +shiny new graphics card that has better anti-aliasing? Suddenly you have to +update all your golden images.). It would be too painful if all your tests are +like this. Fortunately, you learned about +[Dependency Injection](http://en.wikipedia.org/wiki/Dependency_injection) and know the right thing +to do: instead of having your application talk to the system API directly, wrap +the API in an interface (say, `Turtle`) and code to that interface: + +```cpp +class Turtle { + ... + virtual ~Turtle() {}; + virtual void PenUp() = 0; + virtual void PenDown() = 0; + virtual void Forward(int distance) = 0; + virtual void Turn(int degrees) = 0; + virtual void GoTo(int x, int y) = 0; + virtual int GetX() const = 0; + virtual int GetY() const = 0; +}; +``` + +(Note that the destructor of `Turtle` **must** be virtual, as is the case for +**all** classes you intend to inherit from - otherwise the destructor of the +derived class will not be called when you delete an object through a base +pointer, and you'll get corrupted program states like memory leaks.) + +You can control whether the turtle's movement will leave a trace using `PenUp()` +and `PenDown()`, and control its movement using `Forward()`, `Turn()`, and +`GoTo()`. Finally, `GetX()` and `GetY()` tell you the current position of the +turtle. + +Your program will normally use a real implementation of this interface. In +tests, you can use a mock implementation instead. This allows you to easily +check what drawing primitives your program is calling, with what arguments, and +in which order. Tests written this way are much more robust (they won't break +because your new machine does anti-aliasing differently), easier to read and +maintain (the intent of a test is expressed in the code, not in some binary +images), and run *much, much faster*. + +### Writing the Mock Class + +If you are lucky, the mocks you need to use have already been implemented by +some nice people. If, however, you find yourself in the position to write a mock +class, relax - gMock turns this task into a fun game! (Well, almost.) + +#### How to Define It + +Using the `Turtle` interface as example, here are the simple steps you need to +follow: + +* Derive a class `MockTurtle` from `Turtle`. +* Take a *virtual* function of `Turtle` (while it's possible to + [mock non-virtual methods using templates](cook_book.md#MockingNonVirtualMethods), + it's much more involved). +* In the `public:` section of the child class, write `MOCK_METHOD();` +* Now comes the fun part: you take the function signature, cut-and-paste it + into the macro, and add two commas - one between the return type and the + name, another between the name and the argument list. 
+* If you're mocking a const method, add a 4th parameter containing `(const)` + (the parentheses are required). +* Since you're overriding a virtual method, we suggest adding the `override` + keyword. For const methods the 4th parameter becomes `(const, override)`, + for non-const methods just `(override)`. This isn't mandatory. +* Repeat until all virtual functions you want to mock are done. (It goes + without saying that *all* pure virtual methods in your abstract class must + be either mocked or overridden.) + +After the process, you should have something like: + +```cpp +#include "gmock/gmock.h" // Brings in gMock. + +class MockTurtle : public Turtle { + public: + ... + MOCK_METHOD(void, PenUp, (), (override)); + MOCK_METHOD(void, PenDown, (), (override)); + MOCK_METHOD(void, Forward, (int distance), (override)); + MOCK_METHOD(void, Turn, (int degrees), (override)); + MOCK_METHOD(void, GoTo, (int x, int y), (override)); + MOCK_METHOD(int, GetX, (), (const, override)); + MOCK_METHOD(int, GetY, (), (const, override)); +}; +``` + +You don't need to define these mock methods somewhere else - the `MOCK_METHOD` +macro will generate the definitions for you. It's that simple! + +#### Where to Put It + +When you define a mock class, you need to decide where to put its definition. +Some people put it in a `_test.cc`. This is fine when the interface being mocked +(say, `Foo`) is owned by the same person or team. Otherwise, when the owner of +`Foo` changes it, your test could break. (You can't really expect `Foo`'s +maintainer to fix every test that uses `Foo`, can you?) + +So, the rule of thumb is: if you need to mock `Foo` and it's owned by others, +define the mock class in `Foo`'s package (better, in a `testing` sub-package +such that you can clearly separate production code and testing utilities), put +it in a `.h` and a `cc_library`. Then everyone can reference them from their +tests. If `Foo` ever changes, there is only one copy of `MockFoo` to change, and +only tests that depend on the changed methods need to be fixed. + +Another way to do it: you can introduce a thin layer `FooAdaptor` on top of +`Foo` and code to this new interface. Since you own `FooAdaptor`, you can absorb +changes in `Foo` much more easily. While this is more work initially, carefully +choosing the adaptor interface can make your code easier to write and more +readable (a net win in the long run), as you can choose `FooAdaptor` to fit your +specific domain much better than `Foo` does. + + + +### Using Mocks in Tests + +Once you have a mock class, using it is easy. The typical work flow is: + +1. Import the gMock names from the `testing` namespace such that you can use + them unqualified (You only have to do it once per file). Remember that + namespaces are a good idea. +2. Create some mock objects. +3. Specify your expectations on them (How many times will a method be called? + With what arguments? What should it do? etc.). +4. Exercise some code that uses the mocks; optionally, check the result using + googletest assertions. If a mock method is called more than expected or with + wrong arguments, you'll get an error immediately. +5. When a mock is destructed, gMock will automatically check whether all + expectations on it have been satisfied. 
+ +Here's an example: + +```cpp +#include "path/to/mock-turtle.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +using ::testing::AtLeast; // #1 + +TEST(PainterTest, CanDrawSomething) { + MockTurtle turtle; // #2 + EXPECT_CALL(turtle, PenDown()) // #3 + .Times(AtLeast(1)); + + Painter painter(&turtle); // #4 + + EXPECT_TRUE(painter.DrawCircle(0, 0, 10)); // #5 +} +``` + +As you might have guessed, this test checks that `PenDown()` is called at least +once. If the `painter` object didn't call this method, your test will fail with +a message like this: + +```text +path/to/my_test.cc:119: Failure +Actual function call count doesn't match this expectation: +Actually: never called; +Expected: called at least once. +Stack trace: +... +``` + +**Tip 1:** If you run the test from an Emacs buffer, you can hit on the +line number to jump right to the failed expectation. + +**Tip 2:** If your mock objects are never deleted, the final verification won't +happen. Therefore it's a good idea to turn on the heap checker in your tests +when you allocate mocks on the heap. You get that automatically if you use the +`gtest_main` library already. + +**Important note:** gMock requires expectations to be set **before** the mock +functions are called, otherwise the behavior is **undefined**. In particular, +you mustn't interleave `EXPECT_CALL()s` and calls to the mock functions. + +This means `EXPECT_CALL()` should be read as expecting that a call will occur +*in the future*, not that a call has occurred. Why does gMock work like that? +Well, specifying the expectation beforehand allows gMock to report a violation +as soon as it rises, when the context (stack trace, etc) is still available. +This makes debugging much easier. + +Admittedly, this test is contrived and doesn't do much. You can easily achieve +the same effect without using gMock. However, as we shall reveal soon, gMock +allows you to do *so much more* with the mocks. + +### Setting Expectations + +The key to using a mock object successfully is to set the *right expectations* +on it. If you set the expectations too strict, your test will fail as the result +of unrelated changes. If you set them too loose, bugs can slip through. You want +to do it just right such that your test can catch exactly the kind of bugs you +intend it to catch. gMock provides the necessary means for you to do it "just +right." + +#### General Syntax + +In gMock we use the `EXPECT_CALL()` macro to set an expectation on a mock +method. The general syntax is: + +```cpp +EXPECT_CALL(mock_object, method(matchers)) + .Times(cardinality) + .WillOnce(action) + .WillRepeatedly(action); +``` + +The macro has two arguments: first the mock object, and then the method and its +arguments. Note that the two are separated by a comma (`,`), not a period (`.`). +(Why using a comma? The answer is that it was necessary for technical reasons.) +If the method is not overloaded, the macro can also be called without matchers: + +```cpp +EXPECT_CALL(mock_object, non-overloaded-method) + .Times(cardinality) + .WillOnce(action) + .WillRepeatedly(action); +``` + +This syntax allows the test writer to specify "called with any arguments" +without explicitly specifying the number or types of arguments. To avoid +unintended ambiguity, this syntax may only be used for methods which are not +overloaded + +Either form of the macro can be followed by some optional *clauses* that provide +more information about the expectation. We'll discuss how each clause works in +the coming sections. 
+ +This syntax is designed to make an expectation read like English. For example, +you can probably guess that + +```cpp +using ::testing::Return; +... +EXPECT_CALL(turtle, GetX()) + .Times(5) + .WillOnce(Return(100)) + .WillOnce(Return(150)) + .WillRepeatedly(Return(200)); +``` + +says that the `turtle` object's `GetX()` method will be called five times, it +will return 100 the first time, 150 the second time, and then 200 every time. +Some people like to call this style of syntax a Domain-Specific Language (DSL). + +**Note:** Why do we use a macro to do this? Well it serves two purposes: first +it makes expectations easily identifiable (either by `gsearch` or by a human +reader), and second it allows gMock to include the source file location of a +failed expectation in messages, making debugging easier. + +#### Matchers: What Arguments Do We Expect? + +When a mock function takes arguments, we may specify what arguments we are +expecting, for example: + +```cpp +// Expects the turtle to move forward by 100 units. +EXPECT_CALL(turtle, Forward(100)); +``` + +Oftentimes you do not want to be too specific. Remember that talk about tests +being too rigid? Over specification leads to brittle tests and obscures the +intent of tests. Therefore we encourage you to specify only what's necessary—no +more, no less. If you aren't interested in the value of an argument, write `_` +as the argument, which means "anything goes": + +```cpp +using ::testing::_; +... +// Expects that the turtle jumps to somewhere on the x=50 line. +EXPECT_CALL(turtle, GoTo(50, _)); +``` + +`_` is an instance of what we call **matchers**. A matcher is like a predicate +and can test whether an argument is what we'd expect. You can use a matcher +inside `EXPECT_CALL()` wherever a function argument is expected. `_` is a +convenient way of saying "any value". + +In the above examples, `100` and `50` are also matchers; implicitly, they are +the same as `Eq(100)` and `Eq(50)`, which specify that the argument must be +equal (using `operator==`) to the matcher argument. There are many +[built-in matchers](cheat_sheet.md#MatcherList) for common types (as well as +[custom matchers](cook_book.md#NewMatchers)); for example: + +```cpp +using ::testing::Ge; +... +// Expects the turtle moves forward by at least 100. +EXPECT_CALL(turtle, Forward(Ge(100))); +``` + +If you don't care about *any* arguments, rather than specify `_` for each of +them you may instead omit the parameter list: + +```cpp +// Expects the turtle to move forward. +EXPECT_CALL(turtle, Forward); +// Expects the turtle to jump somewhere. +EXPECT_CALL(turtle, GoTo); +``` + +This works for all non-overloaded methods; if a method is overloaded, you need +to help gMock resolve which overload is expected by specifying the number of +arguments and possibly also the +[types of the arguments](cook_book.md#SelectOverload). + +#### Cardinalities: How Many Times Will It Be Called? + +The first clause we can specify following an `EXPECT_CALL()` is `Times()`. We +call its argument a **cardinality** as it tells *how many times* the call should +occur. It allows us to repeat an expectation many times without actually writing +it as many times. More importantly, a cardinality can be "fuzzy", just like a +matcher can be. This allows a user to express the intent of a test exactly. + +An interesting special case is when we say `Times(0)`. 
You may have guessed - it +means that the function shouldn't be called with the given arguments at all, and +gMock will report a googletest failure whenever the function is (wrongfully) +called. + +We've seen `AtLeast(n)` as an example of fuzzy cardinalities earlier. For the +list of built-in cardinalities you can use, see +[here](cheat_sheet.md#CardinalityList). + +The `Times()` clause can be omitted. **If you omit `Times()`, gMock will infer +the cardinality for you.** The rules are easy to remember: + +* If **neither** `WillOnce()` **nor** `WillRepeatedly()` is in the + `EXPECT_CALL()`, the inferred cardinality is `Times(1)`. +* If there are *n* `WillOnce()`'s but **no** `WillRepeatedly()`, where *n* >= + 1, the cardinality is `Times(n)`. +* If there are *n* `WillOnce()`'s and **one** `WillRepeatedly()`, where *n* >= + 0, the cardinality is `Times(AtLeast(n))`. + +**Quick quiz:** what do you think will happen if a function is expected to be +called twice but actually called four times? + +#### Actions: What Should It Do? + +Remember that a mock object doesn't really have a working implementation? We as +users have to tell it what to do when a method is invoked. This is easy in +gMock. + +First, if the return type of a mock function is a built-in type or a pointer, +the function has a **default action** (a `void` function will just return, a +`bool` function will return `false`, and other functions will return 0). In +addition, in C++ 11 and above, a mock function whose return type is +default-constructible (i.e. has a default constructor) has a default action of +returning a default-constructed value. If you don't say anything, this behavior +will be used. + +Second, if a mock function doesn't have a default action, or the default action +doesn't suit you, you can specify the action to be taken each time the +expectation matches using a series of `WillOnce()` clauses followed by an +optional `WillRepeatedly()`. For example, + +```cpp +using ::testing::Return; +... +EXPECT_CALL(turtle, GetX()) + .WillOnce(Return(100)) + .WillOnce(Return(200)) + .WillOnce(Return(300)); +``` + +says that `turtle.GetX()` will be called *exactly three times* (gMock inferred +this from how many `WillOnce()` clauses we've written, since we didn't +explicitly write `Times()`), and will return 100, 200, and 300 respectively. + +```cpp +using ::testing::Return; +... +EXPECT_CALL(turtle, GetY()) + .WillOnce(Return(100)) + .WillOnce(Return(200)) + .WillRepeatedly(Return(300)); +``` + +says that `turtle.GetY()` will be called *at least twice* (gMock knows this as +we've written two `WillOnce()` clauses and a `WillRepeatedly()` while having no +explicit `Times()`), will return 100 and 200 respectively the first two times, +and 300 from the third time on. + +Of course, if you explicitly write a `Times()`, gMock will not try to infer the +cardinality itself. What if the number you specified is larger than there are +`WillOnce()` clauses? Well, after all `WillOnce()`s are used up, gMock will do +the *default* action for the function every time (unless, of course, you have a +`WillRepeatedly()`.). + +What can we do inside `WillOnce()` besides `Return()`? You can return a +reference using `ReturnRef(*variable*)`, or invoke a pre-defined function, among +[others](cook_book.md#using-actions). + +**Important note:** The `EXPECT_CALL()` statement evaluates the action clause +only once, even though the action may be performed many times. Therefore you +must be careful about side effects. 
The following may not do what you want: + +```cpp +using ::testing::Return; +... +int n = 100; +EXPECT_CALL(turtle, GetX()) + .Times(4) + .WillRepeatedly(Return(n++)); +``` + +Instead of returning 100, 101, 102, ..., consecutively, this mock function will +always return 100 as `n++` is only evaluated once. Similarly, `Return(new Foo)` +will create a new `Foo` object when the `EXPECT_CALL()` is executed, and will +return the same pointer every time. If you want the side effect to happen every +time, you need to define a custom action, which we'll teach in the +[cook book](http://). + +Time for another quiz! What do you think the following means? + +```cpp +using ::testing::Return; +... +EXPECT_CALL(turtle, GetY()) + .Times(4) + .WillOnce(Return(100)); +``` + +Obviously `turtle.GetY()` is expected to be called four times. But if you think +it will return 100 every time, think twice! Remember that one `WillOnce()` +clause will be consumed each time the function is invoked and the default action +will be taken afterwards. So the right answer is that `turtle.GetY()` will +return 100 the first time, but **return 0 from the second time on**, as +returning 0 is the default action for `int` functions. + +#### Using Multiple Expectations {#MultiExpectations} + +So far we've only shown examples where you have a single expectation. More +realistically, you'll specify expectations on multiple mock methods which may be +from multiple mock objects. + +By default, when a mock method is invoked, gMock will search the expectations in +the **reverse order** they are defined, and stop when an active expectation that +matches the arguments is found (you can think of it as "newer rules override +older ones."). If the matching expectation cannot take any more calls, you will +get an upper-bound-violated failure. Here's an example: + +```cpp +using ::testing::_; +... +EXPECT_CALL(turtle, Forward(_)); // #1 +EXPECT_CALL(turtle, Forward(10)) // #2 + .Times(2); +``` + +If `Forward(10)` is called three times in a row, the third time it will be an +error, as the last matching expectation (#2) has been saturated. If, however, +the third `Forward(10)` call is replaced by `Forward(20)`, then it would be OK, +as now #1 will be the matching expectation. + +**Note:** Why does gMock search for a match in the *reverse* order of the +expectations? The reason is that this allows a user to set up the default +expectations in a mock object's constructor or the test fixture's set-up phase +and then customize the mock by writing more specific expectations in the test +body. So, if you have two expectations on the same method, you want to put the +one with more specific matchers **after** the other, or the more specific rule +would be shadowed by the more general one that comes after it. + +**Tip:** It is very common to start with a catch-all expectation for a method +and `Times(AnyNumber())` (omitting arguments, or with `_` for all arguments, if +overloaded). This makes any calls to the method expected. This is not necessary +for methods that are not mentioned at all (these are "uninteresting"), but is +useful for methods that have some expectations, but for which other calls are +ok. See +[Understanding Uninteresting vs Unexpected Calls](cook_book.md#uninteresting-vs-unexpected). + +#### Ordered vs Unordered Calls {#OrderedCalls} + +By default, an expectation can match a call even though an earlier expectation +hasn't been satisfied. In other words, the calls don't have to occur in the +order the expectations are specified. 
+ +Sometimes, you may want all the expected calls to occur in a strict order. To +say this in gMock is easy: + +```cpp +using ::testing::InSequence; +... +TEST(FooTest, DrawsLineSegment) { + ... + { + InSequence seq; + + EXPECT_CALL(turtle, PenDown()); + EXPECT_CALL(turtle, Forward(100)); + EXPECT_CALL(turtle, PenUp()); + } + Foo(); +} +``` + +By creating an object of type `InSequence`, all expectations in its scope are +put into a *sequence* and have to occur *sequentially*. Since we are just +relying on the constructor and destructor of this object to do the actual work, +its name is really irrelevant. + +In this example, we test that `Foo()` calls the three expected functions in the +order as written. If a call is made out-of-order, it will be an error. + +(What if you care about the relative order of some of the calls, but not all of +them? Can you specify an arbitrary partial order? The answer is ... yes! The +details can be found [here](cook_book.md#OrderedCalls).) + +#### All Expectations Are Sticky (Unless Said Otherwise) {#StickyExpectations} + +Now let's do a quick quiz to see how well you can use this mock stuff already. +How would you test that the turtle is asked to go to the origin *exactly twice* +(you want to ignore any other instructions it receives)? + +After you've come up with your answer, take a look at ours and compare notes +(solve it yourself first - don't cheat!): + +```cpp +using ::testing::_; +using ::testing::AnyNumber; +... +EXPECT_CALL(turtle, GoTo(_, _)) // #1 + .Times(AnyNumber()); +EXPECT_CALL(turtle, GoTo(0, 0)) // #2 + .Times(2); +``` + +Suppose `turtle.GoTo(0, 0)` is called three times. In the third time, gMock will +see that the arguments match expectation #2 (remember that we always pick the +last matching expectation). Now, since we said that there should be only two +such calls, gMock will report an error immediately. This is basically what we've +told you in the [Using Multiple Expectations](#MultiExpectations) section above. + +This example shows that **expectations in gMock are "sticky" by default**, in +the sense that they remain active even after we have reached their invocation +upper bounds. This is an important rule to remember, as it affects the meaning +of the spec, and is **different** to how it's done in many other mocking +frameworks (Why'd we do that? Because we think our rule makes the common cases +easier to express and understand.). + +Simple? Let's see if you've really understood it: what does the following code +say? + +```cpp +using ::testing::Return; +... +for (int i = n; i > 0; i--) { + EXPECT_CALL(turtle, GetX()) + .WillOnce(Return(10*i)); +} +``` + +If you think it says that `turtle.GetX()` will be called `n` times and will +return 10, 20, 30, ..., consecutively, think twice! The problem is that, as we +said, expectations are sticky. So, the second time `turtle.GetX()` is called, +the last (latest) `EXPECT_CALL()` statement will match, and will immediately +lead to an "upper bound violated" error - this piece of code is not very useful! + +One correct way of saying that `turtle.GetX()` will return 10, 20, 30, ..., is +to explicitly say that the expectations are *not* sticky. In other words, they +should *retire* as soon as they are saturated: + +```cpp +using ::testing::Return; +... 
+for (int i = n; i > 0; i--) { + EXPECT_CALL(turtle, GetX()) + .WillOnce(Return(10*i)) + .RetiresOnSaturation(); +} +``` + +And, there's a better way to do it: in this case, we expect the calls to occur +in a specific order, and we line up the actions to match the order. Since the +order is important here, we should make it explicit using a sequence: + +```cpp +using ::testing::InSequence; +using ::testing::Return; +... +{ + InSequence s; + + for (int i = 1; i <= n; i++) { + EXPECT_CALL(turtle, GetX()) + .WillOnce(Return(10*i)) + .RetiresOnSaturation(); + } +} +``` + +By the way, the other situation where an expectation may *not* be sticky is when +it's in a sequence - as soon as another expectation that comes after it in the +sequence has been used, it automatically retires (and will never be used to +match any call). + +#### Uninteresting Calls + +A mock object may have many methods, and not all of them are that interesting. +For example, in some tests we may not care about how many times `GetX()` and +`GetY()` get called. + +In gMock, if you are not interested in a method, just don't say anything about +it. If a call to this method occurs, you'll see a warning in the test output, +but it won't be a failure. This is called "naggy" behavior; to change, see +[The Nice, the Strict, and the Naggy](cook_book.md#NiceStrictNaggy). diff --git a/GraphBLAS/CUDA/test/googlemock/docs/gmock_faq.md b/GraphBLAS/CUDA/test/googlemock/docs/gmock_faq.md new file mode 100644 index 0000000000..214aabf121 --- /dev/null +++ b/GraphBLAS/CUDA/test/googlemock/docs/gmock_faq.md @@ -0,0 +1,396 @@ +## Legacy gMock FAQ {#GMockFaq} + + + +### When I call a method on my mock object, the method for the real object is invoked instead. What's the problem? + +In order for a method to be mocked, it must be *virtual*, unless you use the +[high-perf dependency injection technique](#MockingNonVirtualMethods). + +### Can I mock a variadic function? + +You cannot mock a variadic function (i.e. a function taking ellipsis (`...`) +arguments) directly in gMock. + +The problem is that in general, there is *no way* for a mock object to know how +many arguments are passed to the variadic method, and what the arguments' types +are. Only the *author of the base class* knows the protocol, and we cannot look +into his or her head. + +Therefore, to mock such a function, the *user* must teach the mock object how to +figure out the number of arguments and their types. One way to do it is to +provide overloaded versions of the function. + +Ellipsis arguments are inherited from C and not really a C++ feature. They are +unsafe to use and don't work with arguments that have constructors or +destructors. Therefore we recommend to avoid them in C++ as much as possible. + +### MSVC gives me warning C4301 or C4373 when I define a mock method with a const parameter. Why? + +If you compile this using Microsoft Visual C++ 2005 SP1: + +```cpp +class Foo { + ... + virtual void Bar(const int i) = 0; +}; + +class MockFoo : public Foo { + ... + MOCK_METHOD(void, Bar, (const int i), (override)); +}; +``` + +You may get the following warning: + +```shell +warning C4301: 'MockFoo::Bar': overriding virtual function only differs from 'Foo::Bar' by const/volatile qualifier +``` + +This is a MSVC bug. The same code compiles fine with gcc, for example. 
If you +use Visual C++ 2008 SP1, you would get the warning: + +```shell +warning C4373: 'MockFoo::Bar': virtual function overrides 'Foo::Bar', previous versions of the compiler did not override when parameters only differed by const/volatile qualifiers +``` + +In C++, if you *declare* a function with a `const` parameter, the `const` +modifier is ignored. Therefore, the `Foo` base class above is equivalent to: + +```cpp +class Foo { + ... + virtual void Bar(int i) = 0; // int or const int? Makes no difference. +}; +``` + +In fact, you can *declare* `Bar()` with an `int` parameter, and define it with a +`const int` parameter. The compiler will still match them up. + +Since making a parameter `const` is meaningless in the method declaration, we +recommend to remove it in both `Foo` and `MockFoo`. That should workaround the +VC bug. + +Note that we are talking about the *top-level* `const` modifier here. If the +function parameter is passed by pointer or reference, declaring the pointee or +referee as `const` is still meaningful. For example, the following two +declarations are *not* equivalent: + +```cpp +void Bar(int* p); // Neither p nor *p is const. +void Bar(const int* p); // p is not const, but *p is. +``` + + + +### I can't figure out why gMock thinks my expectations are not satisfied. What should I do? + +You might want to run your test with `--gmock_verbose=info`. This flag lets +gMock print a trace of every mock function call it receives. By studying the +trace, you'll gain insights on why the expectations you set are not met. + +If you see the message "The mock function has no default action set, and its +return type has no default value set.", then try +[adding a default action](for_dummies.md#DefaultValue). Due to a known issue, +unexpected calls on mocks without default actions don't print out a detailed +comparison between the actual arguments and the expected arguments. + +### My program crashed and `ScopedMockLog` spit out tons of messages. Is it a gMock bug? + +gMock and `ScopedMockLog` are likely doing the right thing here. + +When a test crashes, the failure signal handler will try to log a lot of +information (the stack trace, and the address map, for example). The messages +are compounded if you have many threads with depth stacks. When `ScopedMockLog` +intercepts these messages and finds that they don't match any expectations, it +prints an error for each of them. + +You can learn to ignore the errors, or you can rewrite your expectations to make +your test more robust, for example, by adding something like: + +```cpp +using ::testing::AnyNumber; +using ::testing::Not; +... + // Ignores any log not done by us. + EXPECT_CALL(log, Log(_, Not(EndsWith("/my_file.cc")), _)) + .Times(AnyNumber()); +``` + +### How can I assert that a function is NEVER called? + +```cpp +using ::testing::_; +... + EXPECT_CALL(foo, Bar(_)) + .Times(0); +``` + + + +### I have a failed test where gMock tells me TWICE that a particular expectation is not satisfied. Isn't this redundant? + +When gMock detects a failure, it prints relevant information (the mock function +arguments, the state of relevant expectations, and etc) to help the user debug. +If another failure is detected, gMock will do the same, including printing the +state of relevant expectations. + +Sometimes an expectation's state didn't change between two failures, and you'll +see the same description of the state twice. They are however *not* redundant, +as they refer to *different points in time*. 
The fact they are the same *is* +interesting information. + +### I get a heapcheck failure when using a mock object, but using a real object is fine. What can be wrong? + +Does the class (hopefully a pure interface) you are mocking have a virtual +destructor? + +Whenever you derive from a base class, make sure its destructor is virtual. +Otherwise Bad Things will happen. Consider the following code: + +```cpp +class Base { + public: + // Not virtual, but should be. + ~Base() { ... } + ... +}; + +class Derived : public Base { + public: + ... + private: + std::string value_; +}; + +... + Base* p = new Derived; + ... + delete p; // Surprise! ~Base() will be called, but ~Derived() will not + // - value_ is leaked. +``` + +By changing `~Base()` to virtual, `~Derived()` will be correctly called when +`delete p` is executed, and the heap checker will be happy. + +### The "newer expectations override older ones" rule makes writing expectations awkward. Why does gMock do that? + +When people complain about this, often they are referring to code like: + +```cpp +using ::testing::Return; +... + // foo.Bar() should be called twice, return 1 the first time, and return + // 2 the second time. However, I have to write the expectations in the + // reverse order. This sucks big time!!! + EXPECT_CALL(foo, Bar()) + .WillOnce(Return(2)) + .RetiresOnSaturation(); + EXPECT_CALL(foo, Bar()) + .WillOnce(Return(1)) + .RetiresOnSaturation(); +``` + +The problem, is that they didn't pick the **best** way to express the test's +intent. + +By default, expectations don't have to be matched in *any* particular order. If +you want them to match in a certain order, you need to be explicit. This is +gMock's (and jMock's) fundamental philosophy: it's easy to accidentally +over-specify your tests, and we want to make it harder to do so. + +There are two better ways to write the test spec. You could either put the +expectations in sequence: + +```cpp +using ::testing::Return; +... + // foo.Bar() should be called twice, return 1 the first time, and return + // 2 the second time. Using a sequence, we can write the expectations + // in their natural order. + { + InSequence s; + EXPECT_CALL(foo, Bar()) + .WillOnce(Return(1)) + .RetiresOnSaturation(); + EXPECT_CALL(foo, Bar()) + .WillOnce(Return(2)) + .RetiresOnSaturation(); + } +``` + +or you can put the sequence of actions in the same expectation: + +```cpp +using ::testing::Return; +... + // foo.Bar() should be called twice, return 1 the first time, and return + // 2 the second time. + EXPECT_CALL(foo, Bar()) + .WillOnce(Return(1)) + .WillOnce(Return(2)) + .RetiresOnSaturation(); +``` + +Back to the original questions: why does gMock search the expectations (and +`ON_CALL`s) from back to front? Because this allows a user to set up a mock's +behavior for the common case early (e.g. in the mock's constructor or the test +fixture's set-up phase) and customize it with more specific rules later. If +gMock searches from front to back, this very useful pattern won't be possible. + +### gMock prints a warning when a function without EXPECT_CALL is called, even if I have set its behavior using ON_CALL. Would it be reasonable not to show the warning in this case? + +When choosing between being neat and being safe, we lean toward the latter. So +the answer is that we think it's better to show the warning. + +Often people write `ON_CALL`s in the mock object's constructor or `SetUp()`, as +the default behavior rarely changes from test to test. 
Then in the test body +they set the expectations, which are often different for each test. Having an +`ON_CALL` in the set-up part of a test doesn't mean that the calls are expected. +If there's no `EXPECT_CALL` and the method is called, it's possibly an error. If +we quietly let the call go through without notifying the user, bugs may creep in +unnoticed. + +If, however, you are sure that the calls are OK, you can write + +```cpp +using ::testing::_; +... + EXPECT_CALL(foo, Bar(_)) + .WillRepeatedly(...); +``` + +instead of + +```cpp +using ::testing::_; +... + ON_CALL(foo, Bar(_)) + .WillByDefault(...); +``` + +This tells gMock that you do expect the calls and no warning should be printed. + +Also, you can control the verbosity by specifying `--gmock_verbose=error`. Other +values are `info` and `warning`. If you find the output too noisy when +debugging, just choose a less verbose level. + +### How can I delete the mock function's argument in an action? + +If your mock function takes a pointer argument and you want to delete that +argument, you can use testing::DeleteArg() to delete the N'th (zero-indexed) +argument: + +```cpp +using ::testing::_; + ... + MOCK_METHOD(void, Bar, (X* x, const Y& y)); + ... + EXPECT_CALL(mock_foo_, Bar(_, _)) + .WillOnce(testing::DeleteArg<0>())); +``` + +### How can I perform an arbitrary action on a mock function's argument? + +If you find yourself needing to perform some action that's not supported by +gMock directly, remember that you can define your own actions using +[`MakeAction()`](#NewMonoActions) or +[`MakePolymorphicAction()`](#NewPolyActions), or you can write a stub function +and invoke it using [`Invoke()`](#FunctionsAsActions). + +```cpp +using ::testing::_; +using ::testing::Invoke; + ... + MOCK_METHOD(void, Bar, (X* p)); + ... + EXPECT_CALL(mock_foo_, Bar(_)) + .WillOnce(Invoke(MyAction(...))); +``` + +### My code calls a static/global function. Can I mock it? + +You can, but you need to make some changes. + +In general, if you find yourself needing to mock a static function, it's a sign +that your modules are too tightly coupled (and less flexible, less reusable, +less testable, etc). You are probably better off defining a small interface and +call the function through that interface, which then can be easily mocked. It's +a bit of work initially, but usually pays for itself quickly. + +This Google Testing Blog +[post](https://testing.googleblog.com/2008/06/defeat-static-cling.html) says it +excellently. Check it out. + +### My mock object needs to do complex stuff. It's a lot of pain to specify the actions. gMock sucks! + +I know it's not a question, but you get an answer for free any way. :-) + +With gMock, you can create mocks in C++ easily. And people might be tempted to +use them everywhere. Sometimes they work great, and sometimes you may find them, +well, a pain to use. So, what's wrong in the latter case? + +When you write a test without using mocks, you exercise the code and assert that +it returns the correct value or that the system is in an expected state. This is +sometimes called "state-based testing". + +Mocks are great for what some call "interaction-based" testing: instead of +checking the system state at the very end, mock objects verify that they are +invoked the right way and report an error as soon as it arises, giving you a +handle on the precise context in which the error was triggered. This is often +more effective and economical to do than state-based testing. 
+ +If you are doing state-based testing and using a test double just to simulate +the real object, you are probably better off using a fake. Using a mock in this +case causes pain, as it's not a strong point for mocks to perform complex +actions. If you experience this and think that mocks suck, you are just not +using the right tool for your problem. Or, you might be trying to solve the +wrong problem. :-) + +### I got a warning "Uninteresting function call encountered - default action taken.." Should I panic? + +By all means, NO! It's just an FYI. :-) + +What it means is that you have a mock function, you haven't set any expectations +on it (by gMock's rule this means that you are not interested in calls to this +function and therefore it can be called any number of times), and it is called. +That's OK - you didn't say it's not OK to call the function! + +What if you actually meant to disallow this function to be called, but forgot to +write `EXPECT_CALL(foo, Bar()).Times(0)`? While one can argue that it's the +user's fault, gMock tries to be nice and prints you a note. + +So, when you see the message and believe that there shouldn't be any +uninteresting calls, you should investigate what's going on. To make your life +easier, gMock dumps the stack trace when an uninteresting call is encountered. +From that you can figure out which mock function it is, and how it is called. + +### I want to define a custom action. Should I use Invoke() or implement the ActionInterface interface? + +Either way is fine - you want to choose the one that's more convenient for your +circumstance. + +Usually, if your action is for a particular function type, defining it using +`Invoke()` should be easier; if your action can be used in functions of +different types (e.g. if you are defining `Return(*value*)`), +`MakePolymorphicAction()` is easiest. Sometimes you want precise control on what +types of functions the action can be used in, and implementing `ActionInterface` +is the way to go here. See the implementation of `Return()` in +`testing/base/public/gmock-actions.h` for an example. + +### I use SetArgPointee() in WillOnce(), but gcc complains about "conflicting return type specified". What does it mean? + +You got this error as gMock has no idea what value it should return when the +mock method is called. `SetArgPointee()` says what the side effect is, but +doesn't say what the return value should be. You need `DoAll()` to chain a +`SetArgPointee()` with a `Return()` that provides a value appropriate to the API +being mocked. + +See this [recipe](cook_book.md#mocking-side-effects) for more details and an +example. + +### I have a huge mock class, and Microsoft Visual C++ runs out of memory when compiling it. What can I do? + +We've noticed that when the `/clr` compiler flag is used, Visual C++ uses 5~6 +times as much memory when compiling a mock class. We suggest to avoid `/clr` +when compiling native C++ mocks. diff --git a/GraphBLAS/CUDA/test/googlemock/docs/pump_manual.md b/GraphBLAS/CUDA/test/googlemock/docs/pump_manual.md new file mode 100644 index 0000000000..cdf7c57da2 --- /dev/null +++ b/GraphBLAS/CUDA/test/googlemock/docs/pump_manual.md @@ -0,0 +1,187 @@ +Pump is Useful for Meta Programming. + +# The Problem + +Template and macro libraries often need to define many classes, functions, or +macros that vary only (or almost only) in the number of arguments they take. +It's a lot of repetitive, mechanical, and error-prone work. 
+ +Our experience is that it's tedious to write custom scripts, which tend to +reflect the structure of the generated code poorly and are often hard to read +and edit. For example, a small change needed in the generated code may require +some non-intuitive, non-trivial changes in the script. This is especially +painful when experimenting with the code. + +This script may be useful for generating meta code, for example a series of +macros of FOO1, FOO2, etc. Nevertheless, please make it your last resort +technique by favouring C++ template metaprogramming or variadic macros. + +# Our Solution + +Pump (for Pump is Useful for Meta Programming, Pretty Useful for Meta +Programming, or Practical Utility for Meta Programming, whichever you prefer) is +a simple meta-programming tool for C++. The idea is that a programmer writes a +`foo.pump` file which contains C++ code plus meta code that manipulates the C++ +code. The meta code can handle iterations over a range, nested iterations, local +meta variable definitions, simple arithmetic, and conditional expressions. You +can view it as a small Domain-Specific Language. The meta language is designed +to be non-intrusive (s.t. it won't confuse Emacs' C++ mode, for example) and +concise, making Pump code intuitive and easy to maintain. + +## Highlights + +* The implementation is in a single Python script and thus ultra portable: no + build or installation is needed and it works cross platforms. +* Pump tries to be smart with respect to + [Google's style guide](https://github.com/google/styleguide): it breaks long + lines (easy to have when they are generated) at acceptable places to fit + within 80 columns and indent the continuation lines correctly. +* The format is human-readable and more concise than XML. +* The format works relatively well with Emacs' C++ mode. + +## Examples + +The following Pump code (where meta keywords start with `$`, `[[` and `]]` are +meta brackets, and `$$` starts a meta comment that ends with the line): + +``` +$var n = 3 $$ Defines a meta variable n. +$range i 0..n $$ Declares the range of meta iterator i (inclusive). +$for i [[ + $$ Meta loop. +// Foo$i does blah for $i-ary predicates. +$range j 1..i +template +class Foo$i { +$if i == 0 [[ + blah a; +]] $elif i <= 2 [[ + blah b; +]] $else [[ + blah c; +]] +}; + +]] +``` + +will be translated by the Pump compiler to: + +```cpp +// Foo0 does blah for 0-ary predicates. +template +class Foo0 { + blah a; +}; + +// Foo1 does blah for 1-ary predicates. +template +class Foo1 { + blah b; +}; + +// Foo2 does blah for 2-ary predicates. +template +class Foo2 { + blah b; +}; + +// Foo3 does blah for 3-ary predicates. +template +class Foo3 { + blah c; +}; +``` + +In another example, + +``` +$range i 1..n +Func($for i + [[a$i]]); +$$ The text between i and [[ is the separator between iterations. +``` + +will generate one of the following lines (without the comments), depending on +the value of `n`: + +```cpp +Func(); // If n is 0. +Func(a1); // If n is 1. +Func(a1 + a2); // If n is 2. +Func(a1 + a2 + a3); // If n is 3. +// And so on... +``` + +## Constructs + +We support the following meta programming constructs: + +| `$var id = exp` | Defines a named constant value. `$id` is | +: : valid util the end of the current meta : +: : lexical block. : +| :------------------------------- | :--------------------------------------- | +| `$range id exp..exp` | Sets the range of an iteration variable, | +: : which can be reused in multiple loops : +: : later. 
: +| `$for id sep [[ code ]]` | Iteration. The range of `id` must have | +: : been defined earlier. `$id` is valid in : +: : `code`. : +| `$($)` | Generates a single `$` character. | +| `$id` | Value of the named constant or iteration | +: : variable. : +| `$(exp)` | Value of the expression. | +| `$if exp [[ code ]] else_branch` | Conditional. | +| `[[ code ]]` | Meta lexical block. | +| `cpp_code` | Raw C++ code. | +| `$$ comment` | Meta comment. | + +**Note:** To give the user some freedom in formatting the Pump source code, Pump +ignores a new-line character if it's right after `$for foo` or next to `[[` or +`]]`. Without this rule you'll often be forced to write very long lines to get +the desired output. Therefore sometimes you may need to insert an extra new-line +in such places for a new-line to show up in your output. + +## Grammar + +```ebnf +code ::= atomic_code* +atomic_code ::= $var id = exp + | $var id = [[ code ]] + | $range id exp..exp + | $for id sep [[ code ]] + | $($) + | $id + | $(exp) + | $if exp [[ code ]] else_branch + | [[ code ]] + | cpp_code +sep ::= cpp_code | empty_string +else_branch ::= $else [[ code ]] + | $elif exp [[ code ]] else_branch + | empty_string +exp ::= simple_expression_in_Python_syntax +``` + +## Code + +You can find the source code of Pump in [scripts/pump.py](../scripts/pump.py). +It is still very unpolished and lacks automated tests, although it has been +successfully used many times. If you find a chance to use it in your project, +please let us know what you think! We also welcome help on improving Pump. + +## Real Examples + +You can find real-world applications of Pump in +[Google Test](https://github.com/google/googletest/tree/master/googletest) and +[Google Mock](https://github.com/google/googletest/tree/master/googlemock). The +source file `foo.h.pump` generates `foo.h`. + +## Tips + +* If a meta variable is followed by a letter or digit, you can separate them + using `[[]]`, which inserts an empty string. For example `Foo$j[[]]Helper` + generate `Foo1Helper` when `j` is 1. +* To avoid extra-long Pump source lines, you can break a line anywhere you + want by inserting `[[]]` followed by a new line. Since any new-line + character next to `[[` or `]]` is ignored, the generated code won't contain + this new line. diff --git a/GraphBLAS/CUDA/test/googlemock/include/gmock/gmock-actions.h b/GraphBLAS/CUDA/test/googlemock/include/gmock/gmock-actions.h new file mode 100644 index 0000000000..615651b342 --- /dev/null +++ b/GraphBLAS/CUDA/test/googlemock/include/gmock/gmock-actions.h @@ -0,0 +1,1567 @@ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +// Google Mock - a framework for writing C++ mock classes. +// +// The ACTION* family of macros can be used in a namespace scope to +// define custom actions easily. The syntax: +// +// ACTION(name) { statements; } +// +// will define an action with the given name that executes the +// statements. The value returned by the statements will be used as +// the return value of the action. Inside the statements, you can +// refer to the K-th (0-based) argument of the mock function by +// 'argK', and refer to its type by 'argK_type'. For example: +// +// ACTION(IncrementArg1) { +// arg1_type temp = arg1; +// return ++(*temp); +// } +// +// allows you to write +// +// ...WillOnce(IncrementArg1()); +// +// You can also refer to the entire argument tuple and its type by +// 'args' and 'args_type', and refer to the mock function type and its +// return type by 'function_type' and 'return_type'. +// +// Note that you don't need to specify the types of the mock function +// arguments. However rest assured that your code is still type-safe: +// you'll get a compiler error if *arg1 doesn't support the ++ +// operator, or if the type of ++(*arg1) isn't compatible with the +// mock function's return type, for example. +// +// Sometimes you'll want to parameterize the action. For that you can use +// another macro: +// +// ACTION_P(name, param_name) { statements; } +// +// For example: +// +// ACTION_P(Add, n) { return arg0 + n; } +// +// will allow you to write: +// +// ...WillOnce(Add(5)); +// +// Note that you don't need to provide the type of the parameter +// either. If you need to reference the type of a parameter named +// 'foo', you can write 'foo_type'. For example, in the body of +// ACTION_P(Add, n) above, you can write 'n_type' to refer to the type +// of 'n'. +// +// We also provide ACTION_P2, ACTION_P3, ..., up to ACTION_P10 to support +// multi-parameter actions. +// +// For the purpose of typing, you can view +// +// ACTION_Pk(Foo, p1, ..., pk) { ... } +// +// as shorthand for +// +// template +// FooActionPk Foo(p1_type p1, ..., pk_type pk) { ... } +// +// In particular, you can provide the template type arguments +// explicitly when invoking Foo(), as in Foo(5, false); +// although usually you can rely on the compiler to infer the types +// for you automatically. You can assign the result of expression +// Foo(p1, ..., pk) to a variable of type FooActionPk. This can be useful when composing actions. +// +// You can also overload actions with different numbers of parameters: +// +// ACTION_P(Plus, a) { ... } +// ACTION_P2(Plus, a, b) { ... 
} +// +// While it's tempting to always use the ACTION* macros when defining +// a new action, you should also consider implementing ActionInterface +// or using MakePolymorphicAction() instead, especially if you need to +// use the action a lot. While these approaches require more work, +// they give you more control on the types of the mock function +// arguments and the action parameters, which in general leads to +// better compiler error messages that pay off in the long run. They +// also allow overloading actions based on parameter types (as opposed +// to just based on the number of parameters). +// +// CAVEAT: +// +// ACTION*() can only be used in a namespace scope as templates cannot be +// declared inside of a local class. +// Users can, however, define any local functors (e.g. a lambda) that +// can be used as actions. +// +// MORE INFORMATION: +// +// To learn more about using these macros, please search for 'ACTION' on +// https://github.com/google/googletest/blob/master/googlemock/docs/cook_book.md + +// GOOGLETEST_CM0002 DO NOT DELETE + +#ifndef GMOCK_INCLUDE_GMOCK_GMOCK_ACTIONS_H_ +#define GMOCK_INCLUDE_GMOCK_GMOCK_ACTIONS_H_ + +#ifndef _WIN32_WCE +# include +#endif + +#include +#include +#include +#include +#include +#include + +#include "gmock/internal/gmock-internal-utils.h" +#include "gmock/internal/gmock-port.h" +#include "gmock/internal/gmock-pp.h" + +#ifdef _MSC_VER +# pragma warning(push) +# pragma warning(disable:4100) +#endif + +namespace testing { + +// To implement an action Foo, define: +// 1. a class FooAction that implements the ActionInterface interface, and +// 2. a factory function that creates an Action object from a +// const FooAction*. +// +// The two-level delegation design follows that of Matcher, providing +// consistency for extension developers. It also eases ownership +// management as Action objects can now be copied like plain values. + +namespace internal { + +// BuiltInDefaultValueGetter::Get() returns a +// default-constructed T value. BuiltInDefaultValueGetter::Get() crashes with an error. +// +// This primary template is used when kDefaultConstructible is true. +template +struct BuiltInDefaultValueGetter { + static T Get() { return T(); } +}; +template +struct BuiltInDefaultValueGetter { + static T Get() { + Assert(false, __FILE__, __LINE__, + "Default action undefined for the function return type."); + return internal::Invalid(); + // The above statement will never be reached, but is required in + // order for this function to compile. + } +}; + +// BuiltInDefaultValue::Get() returns the "built-in" default value +// for type T, which is NULL when T is a raw pointer type, 0 when T is +// a numeric type, false when T is bool, or "" when T is string or +// std::string. In addition, in C++11 and above, it turns a +// default-constructed T value if T is default constructible. For any +// other type T, the built-in default T value is undefined, and the +// function will abort the process. +template +class BuiltInDefaultValue { + public: + // This function returns true if and only if type T has a built-in default + // value. + static bool Exists() { + return ::std::is_default_constructible::value; + } + + static T Get() { + return BuiltInDefaultValueGetter< + T, ::std::is_default_constructible::value>::Get(); + } +}; + +// This partial specialization says that we use the same built-in +// default value for T and const T. 
+template +class BuiltInDefaultValue { + public: + static bool Exists() { return BuiltInDefaultValue::Exists(); } + static T Get() { return BuiltInDefaultValue::Get(); } +}; + +// This partial specialization defines the default values for pointer +// types. +template +class BuiltInDefaultValue { + public: + static bool Exists() { return true; } + static T* Get() { return nullptr; } +}; + +// The following specializations define the default values for +// specific types we care about. +#define GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(type, value) \ + template <> \ + class BuiltInDefaultValue { \ + public: \ + static bool Exists() { return true; } \ + static type Get() { return value; } \ + } + +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(void, ); // NOLINT +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(::std::string, ""); +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(bool, false); +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(unsigned char, '\0'); +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(signed char, '\0'); +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(char, '\0'); + +// There's no need for a default action for signed wchar_t, as that +// type is the same as wchar_t for gcc, and invalid for MSVC. +// +// There's also no need for a default action for unsigned wchar_t, as +// that type is the same as unsigned int for gcc, and invalid for +// MSVC. +#if GMOCK_WCHAR_T_IS_NATIVE_ +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(wchar_t, 0U); // NOLINT +#endif + +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(unsigned short, 0U); // NOLINT +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(signed short, 0); // NOLINT +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(unsigned int, 0U); +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(signed int, 0); +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(unsigned long, 0UL); // NOLINT +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(signed long, 0L); // NOLINT +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(unsigned long long, 0); // NOLINT +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(signed long long, 0); // NOLINT +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(float, 0); +GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_(double, 0); + +#undef GMOCK_DEFINE_DEFAULT_ACTION_FOR_RETURN_TYPE_ + +// Simple two-arg form of std::disjunction. +template +using disjunction = typename ::std::conditional::type; + +} // namespace internal + +// When an unexpected function call is encountered, Google Mock will +// let it return a default value if the user has specified one for its +// return type, or if the return type has a built-in default value; +// otherwise Google Mock won't know what value to return and will have +// to abort the process. +// +// The DefaultValue class allows a user to specify the +// default value for a type T that is both copyable and publicly +// destructible (i.e. anything that can be used as a function return +// type). The usage is: +// +// // Sets the default value for type T to be foo. +// DefaultValue::Set(foo); +template +class DefaultValue { + public: + // Sets the default value for type T; requires T to be + // copy-constructable and have a public destructor. + static void Set(T x) { + delete producer_; + producer_ = new FixedValueProducer(x); + } + + // Provides a factory function to be called to generate the default value. + // This method can be used even if T is only move-constructible, but it is not + // limited to that case. 
+ typedef T (*FactoryFunction)(); + static void SetFactory(FactoryFunction factory) { + delete producer_; + producer_ = new FactoryValueProducer(factory); + } + + // Unsets the default value for type T. + static void Clear() { + delete producer_; + producer_ = nullptr; + } + + // Returns true if and only if the user has set the default value for type T. + static bool IsSet() { return producer_ != nullptr; } + + // Returns true if T has a default return value set by the user or there + // exists a built-in default value. + static bool Exists() { + return IsSet() || internal::BuiltInDefaultValue::Exists(); + } + + // Returns the default value for type T if the user has set one; + // otherwise returns the built-in default value. Requires that Exists() + // is true, which ensures that the return value is well-defined. + static T Get() { + return producer_ == nullptr ? internal::BuiltInDefaultValue::Get() + : producer_->Produce(); + } + + private: + class ValueProducer { + public: + virtual ~ValueProducer() {} + virtual T Produce() = 0; + }; + + class FixedValueProducer : public ValueProducer { + public: + explicit FixedValueProducer(T value) : value_(value) {} + T Produce() override { return value_; } + + private: + const T value_; + GTEST_DISALLOW_COPY_AND_ASSIGN_(FixedValueProducer); + }; + + class FactoryValueProducer : public ValueProducer { + public: + explicit FactoryValueProducer(FactoryFunction factory) + : factory_(factory) {} + T Produce() override { return factory_(); } + + private: + const FactoryFunction factory_; + GTEST_DISALLOW_COPY_AND_ASSIGN_(FactoryValueProducer); + }; + + static ValueProducer* producer_; +}; + +// This partial specialization allows a user to set default values for +// reference types. +template +class DefaultValue { + public: + // Sets the default value for type T&. + static void Set(T& x) { // NOLINT + address_ = &x; + } + + // Unsets the default value for type T&. + static void Clear() { address_ = nullptr; } + + // Returns true if and only if the user has set the default value for type T&. + static bool IsSet() { return address_ != nullptr; } + + // Returns true if T has a default return value set by the user or there + // exists a built-in default value. + static bool Exists() { + return IsSet() || internal::BuiltInDefaultValue::Exists(); + } + + // Returns the default value for type T& if the user has set one; + // otherwise returns the built-in default value if there is one; + // otherwise aborts the process. + static T& Get() { + return address_ == nullptr ? internal::BuiltInDefaultValue::Get() + : *address_; + } + + private: + static T* address_; +}; + +// This specialization allows DefaultValue::Get() to +// compile. +template <> +class DefaultValue { + public: + static bool Exists() { return true; } + static void Get() {} +}; + +// Points to the user-set default value for type T. +template +typename DefaultValue::ValueProducer* DefaultValue::producer_ = nullptr; + +// Points to the user-set default value for type T&. +template +T* DefaultValue::address_ = nullptr; + +// Implement this interface to define an action for function type F. +template +class ActionInterface { + public: + typedef typename internal::Function::Result Result; + typedef typename internal::Function::ArgumentTuple ArgumentTuple; + + ActionInterface() {} + virtual ~ActionInterface() {} + + // Performs the action. This method is not const, as in general an + // action can have side effects and be stateful. 
For example, a + // get-the-next-element-from-the-collection action will need to + // remember the current element. + virtual Result Perform(const ArgumentTuple& args) = 0; + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(ActionInterface); +}; + +// An Action is a copyable and IMMUTABLE (except by assignment) +// object that represents an action to be taken when a mock function +// of type F is called. The implementation of Action is just a +// std::shared_ptr to const ActionInterface. Don't inherit from Action! +// You can view an object implementing ActionInterface as a +// concrete action (including its current state), and an Action +// object as a handle to it. +template +class Action { + // Adapter class to allow constructing Action from a legacy ActionInterface. + // New code should create Actions from functors instead. + struct ActionAdapter { + // Adapter must be copyable to satisfy std::function requirements. + ::std::shared_ptr> impl_; + + template + typename internal::Function::Result operator()(Args&&... args) { + return impl_->Perform( + ::std::forward_as_tuple(::std::forward(args)...)); + } + }; + + public: + typedef typename internal::Function::Result Result; + typedef typename internal::Function::ArgumentTuple ArgumentTuple; + + // Constructs a null Action. Needed for storing Action objects in + // STL containers. + Action() {} + + // Construct an Action from a specified callable. + // This cannot take std::function directly, because then Action would not be + // directly constructible from lambda (it would require two conversions). + template , G>, + typename IsNoArgsFunctor = + ::std::is_constructible<::std::function, G>, + typename = typename ::std::enable_if::value>::type> + Action(G&& fun) { // NOLINT + Init(::std::forward(fun), IsCompatibleFunctor()); + } + + // Constructs an Action from its implementation. + explicit Action(ActionInterface* impl) + : fun_(ActionAdapter{::std::shared_ptr>(impl)}) {} + + // This constructor allows us to turn an Action object into an + // Action, as long as F's arguments can be implicitly converted + // to Func's and Func's return type can be implicitly converted to F's. + template + explicit Action(const Action& action) : fun_(action.fun_) {} + + // Returns true if and only if this is the DoDefault() action. + bool IsDoDefault() const { return fun_ == nullptr; } + + // Performs the action. Note that this method is const even though + // the corresponding method in ActionInterface is not. The reason + // is that a const Action means that it cannot be re-bound to + // another concrete action, not that the concrete action it binds to + // cannot change state. (Think of the difference between a const + // pointer and a pointer to const.) + Result Perform(ArgumentTuple args) const { + if (IsDoDefault()) { + internal::IllegalDoDefault(__FILE__, __LINE__); + } + return internal::Apply(fun_, ::std::move(args)); + } + + private: + template + friend class Action; + + template + void Init(G&& g, ::std::true_type) { + fun_ = ::std::forward(g); + } + + template + void Init(G&& g, ::std::false_type) { + fun_ = IgnoreArgs::type>{::std::forward(g)}; + } + + template + struct IgnoreArgs { + template + Result operator()(const Args&...) const { + return function_impl(); + } + + FunctionImpl function_impl; + }; + + // fun_ is an empty function if and only if this is the DoDefault() action. + ::std::function fun_; +}; + +// The PolymorphicAction class template makes it easy to implement a +// polymorphic action (i.e. 
an action that can be used in mock +// functions of than one type, e.g. Return()). +// +// To define a polymorphic action, a user first provides a COPYABLE +// implementation class that has a Perform() method template: +// +// class FooAction { +// public: +// template +// Result Perform(const ArgumentTuple& args) const { +// // Processes the arguments and returns a result, using +// // std::get(args) to get the N-th (0-based) argument in the tuple. +// } +// ... +// }; +// +// Then the user creates the polymorphic action using +// MakePolymorphicAction(object) where object has type FooAction. See +// the definition of Return(void) and SetArgumentPointee(value) for +// complete examples. +template +class PolymorphicAction { + public: + explicit PolymorphicAction(const Impl& impl) : impl_(impl) {} + + template + operator Action() const { + return Action(new MonomorphicImpl(impl_)); + } + + private: + template + class MonomorphicImpl : public ActionInterface { + public: + typedef typename internal::Function::Result Result; + typedef typename internal::Function::ArgumentTuple ArgumentTuple; + + explicit MonomorphicImpl(const Impl& impl) : impl_(impl) {} + + Result Perform(const ArgumentTuple& args) override { + return impl_.template Perform(args); + } + + private: + Impl impl_; + + GTEST_DISALLOW_ASSIGN_(MonomorphicImpl); + }; + + Impl impl_; + + GTEST_DISALLOW_ASSIGN_(PolymorphicAction); +}; + +// Creates an Action from its implementation and returns it. The +// created Action object owns the implementation. +template +Action MakeAction(ActionInterface* impl) { + return Action(impl); +} + +// Creates a polymorphic action from its implementation. This is +// easier to use than the PolymorphicAction constructor as it +// doesn't require you to explicitly write the template argument, e.g. +// +// MakePolymorphicAction(foo); +// vs +// PolymorphicAction(foo); +template +inline PolymorphicAction MakePolymorphicAction(const Impl& impl) { + return PolymorphicAction(impl); +} + +namespace internal { + +// Helper struct to specialize ReturnAction to execute a move instead of a copy +// on return. Useful for move-only types, but could be used on any type. +template +struct ByMoveWrapper { + explicit ByMoveWrapper(T value) : payload(std::move(value)) {} + T payload; +}; + +// Implements the polymorphic Return(x) action, which can be used in +// any function that returns the type of x, regardless of the argument +// types. +// +// Note: The value passed into Return must be converted into +// Function::Result when this action is cast to Action rather than +// when that action is performed. This is important in scenarios like +// +// MOCK_METHOD1(Method, T(U)); +// ... +// { +// Foo foo; +// X x(&foo); +// EXPECT_CALL(mock, Method(_)).WillOnce(Return(x)); +// } +// +// In the example above the variable x holds reference to foo which leaves +// scope and gets destroyed. If copying X just copies a reference to foo, +// that copy will be left with a hanging reference. If conversion to T +// makes a copy of foo, the above code is safe. To support that scenario, we +// need to make sure that the type conversion happens inside the EXPECT_CALL +// statement, and conversion of the result of Return to Action is a +// good place for that. +// +// The real life example of the above scenario happens when an invocation +// of gtl::Container() is passed into Return. +// +template +class ReturnAction { + public: + // Constructs a ReturnAction object from the value to be returned. 
+ // 'value' is passed by value instead of by const reference in order + // to allow Return("string literal") to compile. + explicit ReturnAction(R value) : value_(new R(std::move(value))) {} + + // This template type conversion operator allows Return(x) to be + // used in ANY function that returns x's type. + template + operator Action() const { // NOLINT + // Assert statement belongs here because this is the best place to verify + // conditions on F. It produces the clearest error messages + // in most compilers. + // Impl really belongs in this scope as a local class but can't + // because MSVC produces duplicate symbols in different translation units + // in this case. Until MS fixes that bug we put Impl into the class scope + // and put the typedef both here (for use in assert statement) and + // in the Impl class. But both definitions must be the same. + typedef typename Function::Result Result; + GTEST_COMPILE_ASSERT_( + !std::is_reference::value, + use_ReturnRef_instead_of_Return_to_return_a_reference); + static_assert(!std::is_void::value, + "Can't use Return() on an action expected to return `void`."); + return Action(new Impl(value_)); + } + + private: + // Implements the Return(x) action for a particular function type F. + template + class Impl : public ActionInterface { + public: + typedef typename Function::Result Result; + typedef typename Function::ArgumentTuple ArgumentTuple; + + // The implicit cast is necessary when Result has more than one + // single-argument constructor (e.g. Result is std::vector) and R + // has a type conversion operator template. In that case, value_(value) + // won't compile as the compiler doesn't known which constructor of + // Result to call. ImplicitCast_ forces the compiler to convert R to + // Result without considering explicit constructors, thus resolving the + // ambiguity. value_ is then initialized using its copy constructor. + explicit Impl(const std::shared_ptr& value) + : value_before_cast_(*value), + value_(ImplicitCast_(value_before_cast_)) {} + + Result Perform(const ArgumentTuple&) override { return value_; } + + private: + GTEST_COMPILE_ASSERT_(!std::is_reference::value, + Result_cannot_be_a_reference_type); + // We save the value before casting just in case it is being cast to a + // wrapper type. + R value_before_cast_; + Result value_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(Impl); + }; + + // Partially specialize for ByMoveWrapper. This version of ReturnAction will + // move its contents instead. + template + class Impl, F> : public ActionInterface { + public: + typedef typename Function::Result Result; + typedef typename Function::ArgumentTuple ArgumentTuple; + + explicit Impl(const std::shared_ptr& wrapper) + : performed_(false), wrapper_(wrapper) {} + + Result Perform(const ArgumentTuple&) override { + GTEST_CHECK_(!performed_) + << "A ByMove() action should only be performed once."; + performed_ = true; + return std::move(wrapper_->payload); + } + + private: + bool performed_; + const std::shared_ptr wrapper_; + + GTEST_DISALLOW_ASSIGN_(Impl); + }; + + const std::shared_ptr value_; + + GTEST_DISALLOW_ASSIGN_(ReturnAction); +}; + +// Implements the ReturnNull() action. +class ReturnNullAction { + public: + // Allows ReturnNull() to be used in any pointer-returning function. In C++11 + // this is enforced by returning nullptr, and in non-C++11 by asserting a + // pointer type on compile time. + template + static Result Perform(const ArgumentTuple&) { + return nullptr; + } +}; + +// Implements the Return() action. 
+class ReturnVoidAction { + public: + // Allows Return() to be used in any void-returning function. + template + static void Perform(const ArgumentTuple&) { + static_assert(std::is_void::value, "Result should be void."); + } +}; + +// Implements the polymorphic ReturnRef(x) action, which can be used +// in any function that returns a reference to the type of x, +// regardless of the argument types. +template +class ReturnRefAction { + public: + // Constructs a ReturnRefAction object from the reference to be returned. + explicit ReturnRefAction(T& ref) : ref_(ref) {} // NOLINT + + // This template type conversion operator allows ReturnRef(x) to be + // used in ANY function that returns a reference to x's type. + template + operator Action() const { + typedef typename Function::Result Result; + // Asserts that the function return type is a reference. This + // catches the user error of using ReturnRef(x) when Return(x) + // should be used, and generates some helpful error message. + GTEST_COMPILE_ASSERT_(std::is_reference::value, + use_Return_instead_of_ReturnRef_to_return_a_value); + return Action(new Impl(ref_)); + } + + private: + // Implements the ReturnRef(x) action for a particular function type F. + template + class Impl : public ActionInterface { + public: + typedef typename Function::Result Result; + typedef typename Function::ArgumentTuple ArgumentTuple; + + explicit Impl(T& ref) : ref_(ref) {} // NOLINT + + Result Perform(const ArgumentTuple&) override { return ref_; } + + private: + T& ref_; + + GTEST_DISALLOW_ASSIGN_(Impl); + }; + + T& ref_; + + GTEST_DISALLOW_ASSIGN_(ReturnRefAction); +}; + +// Implements the polymorphic ReturnRefOfCopy(x) action, which can be +// used in any function that returns a reference to the type of x, +// regardless of the argument types. +template +class ReturnRefOfCopyAction { + public: + // Constructs a ReturnRefOfCopyAction object from the reference to + // be returned. + explicit ReturnRefOfCopyAction(const T& value) : value_(value) {} // NOLINT + + // This template type conversion operator allows ReturnRefOfCopy(x) to be + // used in ANY function that returns a reference to x's type. + template + operator Action() const { + typedef typename Function::Result Result; + // Asserts that the function return type is a reference. This + // catches the user error of using ReturnRefOfCopy(x) when Return(x) + // should be used, and generates some helpful error message. + GTEST_COMPILE_ASSERT_( + std::is_reference::value, + use_Return_instead_of_ReturnRefOfCopy_to_return_a_value); + return Action(new Impl(value_)); + } + + private: + // Implements the ReturnRefOfCopy(x) action for a particular function type F. + template + class Impl : public ActionInterface { + public: + typedef typename Function::Result Result; + typedef typename Function::ArgumentTuple ArgumentTuple; + + explicit Impl(const T& value) : value_(value) {} // NOLINT + + Result Perform(const ArgumentTuple&) override { return value_; } + + private: + T value_; + + GTEST_DISALLOW_ASSIGN_(Impl); + }; + + const T value_; + + GTEST_DISALLOW_ASSIGN_(ReturnRefOfCopyAction); +}; + +// Implements the polymorphic ReturnRoundRobin(v) action, which can be +// used in any function that returns the element_type of v. +template +class ReturnRoundRobinAction { + public: + explicit ReturnRoundRobinAction(std::vector values) { + GTEST_CHECK_(!values.empty()) + << "ReturnRoundRobin requires at least one element."; + state_->values = std::move(values); + } + + template + T operator()(Args&&...) 
const { + return state_->Next(); + } + + private: + struct State { + T Next() { + T ret_val = values[i++]; + if (i == values.size()) i = 0; + return ret_val; + } + + std::vector values; + size_t i = 0; + }; + std::shared_ptr state_ = std::make_shared(); +}; + +// Implements the polymorphic DoDefault() action. +class DoDefaultAction { + public: + // This template type conversion operator allows DoDefault() to be + // used in any function. + template + operator Action() const { return Action(); } // NOLINT +}; + +// Implements the Assign action to set a given pointer referent to a +// particular value. +template +class AssignAction { + public: + AssignAction(T1* ptr, T2 value) : ptr_(ptr), value_(value) {} + + template + void Perform(const ArgumentTuple& /* args */) const { + *ptr_ = value_; + } + + private: + T1* const ptr_; + const T2 value_; + + GTEST_DISALLOW_ASSIGN_(AssignAction); +}; + +#if !GTEST_OS_WINDOWS_MOBILE + +// Implements the SetErrnoAndReturn action to simulate return from +// various system calls and libc functions. +template +class SetErrnoAndReturnAction { + public: + SetErrnoAndReturnAction(int errno_value, T result) + : errno_(errno_value), + result_(result) {} + template + Result Perform(const ArgumentTuple& /* args */) const { + errno = errno_; + return result_; + } + + private: + const int errno_; + const T result_; + + GTEST_DISALLOW_ASSIGN_(SetErrnoAndReturnAction); +}; + +#endif // !GTEST_OS_WINDOWS_MOBILE + +// Implements the SetArgumentPointee(x) action for any function +// whose N-th argument (0-based) is a pointer to x's type. +template +struct SetArgumentPointeeAction { + A value; + + template + void operator()(const Args&... args) const { + *::std::get(std::tie(args...)) = value; + } +}; + +// Implements the Invoke(object_ptr, &Class::Method) action. +template +struct InvokeMethodAction { + Class* const obj_ptr; + const MethodPtr method_ptr; + + template + auto operator()(Args&&... args) const + -> decltype((obj_ptr->*method_ptr)(std::forward(args)...)) { + return (obj_ptr->*method_ptr)(std::forward(args)...); + } +}; + +// Implements the InvokeWithoutArgs(f) action. The template argument +// FunctionImpl is the implementation type of f, which can be either a +// function pointer or a functor. InvokeWithoutArgs(f) can be used as an +// Action as long as f's type is compatible with F. +template +struct InvokeWithoutArgsAction { + FunctionImpl function_impl; + + // Allows InvokeWithoutArgs(f) to be used as any action whose type is + // compatible with f. + template + auto operator()(const Args&...) -> decltype(function_impl()) { + return function_impl(); + } +}; + +// Implements the InvokeWithoutArgs(object_ptr, &Class::Method) action. +template +struct InvokeMethodWithoutArgsAction { + Class* const obj_ptr; + const MethodPtr method_ptr; + + using ReturnType = + decltype((std::declval()->*std::declval())()); + + template + ReturnType operator()(const Args&...) const { + return (obj_ptr->*method_ptr)(); + } +}; + +// Implements the IgnoreResult(action) action. +template +class IgnoreResultAction { + public: + explicit IgnoreResultAction(const A& action) : action_(action) {} + + template + operator Action() const { + // Assert statement belongs here because this is the best place to verify + // conditions on F. It produces the clearest error messages + // in most compilers. + // Impl really belongs in this scope as a local class but can't + // because MSVC produces duplicate symbols in different translation units + // in this case. 
Until MS fixes that bug we put Impl into the class scope + // and put the typedef both here (for use in assert statement) and + // in the Impl class. But both definitions must be the same. + typedef typename internal::Function::Result Result; + + // Asserts at compile time that F returns void. + static_assert(std::is_void::value, "Result type should be void."); + + return Action(new Impl(action_)); + } + + private: + template + class Impl : public ActionInterface { + public: + typedef typename internal::Function::Result Result; + typedef typename internal::Function::ArgumentTuple ArgumentTuple; + + explicit Impl(const A& action) : action_(action) {} + + void Perform(const ArgumentTuple& args) override { + // Performs the action and ignores its result. + action_.Perform(args); + } + + private: + // Type OriginalFunction is the same as F except that its return + // type is IgnoredValue. + typedef typename internal::Function::MakeResultIgnoredValue + OriginalFunction; + + const Action action_; + + GTEST_DISALLOW_ASSIGN_(Impl); + }; + + const A action_; + + GTEST_DISALLOW_ASSIGN_(IgnoreResultAction); +}; + +template +struct WithArgsAction { + InnerAction action; + + // The inner action could be anything convertible to Action. + // We use the conversion operator to detect the signature of the inner Action. + template + operator Action() const { // NOLINT + using TupleType = std::tuple; + Action::type...)> + converted(action); + + return [converted](Args... args) -> R { + return converted.Perform(std::forward_as_tuple( + std::get(std::forward_as_tuple(std::forward(args)...))...)); + }; + } +}; + +template +struct DoAllAction { + private: + template + std::vector> Convert(IndexSequence) const { + return {std::get(actions)...}; + } + + public: + std::tuple actions; + + template + operator Action() const { // NOLINT + struct Op { + std::vector> converted; + Action last; + R operator()(Args... args) const { + auto tuple_args = std::forward_as_tuple(std::forward(args)...); + for (auto& a : converted) { + a.Perform(tuple_args); + } + return last.Perform(tuple_args); + } + }; + return Op{Convert(MakeIndexSequence()), + std::get(actions)}; + } +}; + +} // namespace internal + +// An Unused object can be implicitly constructed from ANY value. +// This is handy when defining actions that ignore some or all of the +// mock function arguments. For example, given +// +// MOCK_METHOD3(Foo, double(const string& label, double x, double y)); +// MOCK_METHOD3(Bar, double(int index, double x, double y)); +// +// instead of +// +// double DistanceToOriginWithLabel(const string& label, double x, double y) { +// return sqrt(x*x + y*y); +// } +// double DistanceToOriginWithIndex(int index, double x, double y) { +// return sqrt(x*x + y*y); +// } +// ... +// EXPECT_CALL(mock, Foo("abc", _, _)) +// .WillOnce(Invoke(DistanceToOriginWithLabel)); +// EXPECT_CALL(mock, Bar(5, _, _)) +// .WillOnce(Invoke(DistanceToOriginWithIndex)); +// +// you could write +// +// // We can declare any uninteresting argument as Unused. +// double DistanceToOrigin(Unused, double x, double y) { +// return sqrt(x*x + y*y); +// } +// ... +// EXPECT_CALL(mock, Foo("abc", _, _)).WillOnce(Invoke(DistanceToOrigin)); +// EXPECT_CALL(mock, Bar(5, _, _)).WillOnce(Invoke(DistanceToOrigin)); +typedef internal::IgnoredValue Unused; + +// Creates an action that does actions a1, a2, ..., sequentially in +// each invocation. +template +internal::DoAllAction::type...> DoAll( + Action&&... 
action) { + return {std::forward_as_tuple(std::forward(action)...)}; +} + +// WithArg(an_action) creates an action that passes the k-th +// (0-based) argument of the mock function to an_action and performs +// it. It adapts an action accepting one argument to one that accepts +// multiple arguments. For convenience, we also provide +// WithArgs(an_action) (defined below) as a synonym. +template +internal::WithArgsAction::type, k> +WithArg(InnerAction&& action) { + return {std::forward(action)}; +} + +// WithArgs(an_action) creates an action that passes +// the selected arguments of the mock function to an_action and +// performs it. It serves as an adaptor between actions with +// different argument lists. +template +internal::WithArgsAction::type, k, ks...> +WithArgs(InnerAction&& action) { + return {std::forward(action)}; +} + +// WithoutArgs(inner_action) can be used in a mock function with a +// non-empty argument list to perform inner_action, which takes no +// argument. In other words, it adapts an action accepting no +// argument to one that accepts (and ignores) arguments. +template +internal::WithArgsAction::type> +WithoutArgs(InnerAction&& action) { + return {std::forward(action)}; +} + +// Creates an action that returns 'value'. 'value' is passed by value +// instead of const reference - otherwise Return("string literal") +// will trigger a compiler error about using array as initializer. +template +internal::ReturnAction Return(R value) { + return internal::ReturnAction(std::move(value)); +} + +// Creates an action that returns NULL. +inline PolymorphicAction ReturnNull() { + return MakePolymorphicAction(internal::ReturnNullAction()); +} + +// Creates an action that returns from a void function. +inline PolymorphicAction Return() { + return MakePolymorphicAction(internal::ReturnVoidAction()); +} + +// Creates an action that returns the reference to a variable. +template +inline internal::ReturnRefAction ReturnRef(R& x) { // NOLINT + return internal::ReturnRefAction(x); +} + +// Prevent using ReturnRef on reference to temporary. +template +internal::ReturnRefAction ReturnRef(R&&) = delete; + +// Creates an action that returns the reference to a copy of the +// argument. The copy is created when the action is constructed and +// lives as long as the action. +template +inline internal::ReturnRefOfCopyAction ReturnRefOfCopy(const R& x) { + return internal::ReturnRefOfCopyAction(x); +} + +// Modifies the parent action (a Return() action) to perform a move of the +// argument instead of a copy. +// Return(ByMove()) actions can only be executed once and will assert this +// invariant. +template +internal::ByMoveWrapper ByMove(R x) { + return internal::ByMoveWrapper(std::move(x)); +} + +// Creates an action that returns an element of `vals`. Calling this action will +// repeatedly return the next value from `vals` until it reaches the end and +// will restart from the beginning. +template +internal::ReturnRoundRobinAction ReturnRoundRobin(std::vector vals) { + return internal::ReturnRoundRobinAction(std::move(vals)); +} + +// Creates an action that returns an element of `vals`. Calling this action will +// repeatedly return the next value from `vals` until it reaches the end and +// will restart from the beginning. +template +internal::ReturnRoundRobinAction ReturnRoundRobin( + std::initializer_list vals) { + return internal::ReturnRoundRobinAction(std::vector(vals)); +} + +// Creates an action that does the default action for the give mock function. 
+inline internal::DoDefaultAction DoDefault() { + return internal::DoDefaultAction(); +} + +// Creates an action that sets the variable pointed by the N-th +// (0-based) function argument to 'value'. +template +internal::SetArgumentPointeeAction SetArgPointee(T value) { + return {std::move(value)}; +} + +// The following version is DEPRECATED. +template +internal::SetArgumentPointeeAction SetArgumentPointee(T value) { + return {std::move(value)}; +} + +// Creates an action that sets a pointer referent to a given value. +template +PolymorphicAction > Assign(T1* ptr, T2 val) { + return MakePolymorphicAction(internal::AssignAction(ptr, val)); +} + +#if !GTEST_OS_WINDOWS_MOBILE + +// Creates an action that sets errno and returns the appropriate error. +template +PolymorphicAction > +SetErrnoAndReturn(int errval, T result) { + return MakePolymorphicAction( + internal::SetErrnoAndReturnAction(errval, result)); +} + +#endif // !GTEST_OS_WINDOWS_MOBILE + +// Various overloads for Invoke(). + +// Legacy function. +// Actions can now be implicitly constructed from callables. No need to create +// wrapper objects. +// This function exists for backwards compatibility. +template +typename std::decay::type Invoke(FunctionImpl&& function_impl) { + return std::forward(function_impl); +} + +// Creates an action that invokes the given method on the given object +// with the mock function's arguments. +template +internal::InvokeMethodAction Invoke(Class* obj_ptr, + MethodPtr method_ptr) { + return {obj_ptr, method_ptr}; +} + +// Creates an action that invokes 'function_impl' with no argument. +template +internal::InvokeWithoutArgsAction::type> +InvokeWithoutArgs(FunctionImpl function_impl) { + return {std::move(function_impl)}; +} + +// Creates an action that invokes the given method on the given object +// with no argument. +template +internal::InvokeMethodWithoutArgsAction InvokeWithoutArgs( + Class* obj_ptr, MethodPtr method_ptr) { + return {obj_ptr, method_ptr}; +} + +// Creates an action that performs an_action and throws away its +// result. In other words, it changes the return type of an_action to +// void. an_action MUST NOT return void, or the code won't compile. +template +inline internal::IgnoreResultAction IgnoreResult(const A& an_action) { + return internal::IgnoreResultAction(an_action); +} + +// Creates a reference wrapper for the given L-value. If necessary, +// you can explicitly specify the type of the reference. For example, +// suppose 'derived' is an object of type Derived, ByRef(derived) +// would wrap a Derived&. If you want to wrap a const Base& instead, +// where Base is a base class of Derived, just write: +// +// ByRef(derived) +// +// N.B. ByRef is redundant with std::ref, std::cref and std::reference_wrapper. +// However, it may still be used for consistency with ByMove(). +template +inline ::std::reference_wrapper ByRef(T& l_value) { // NOLINT + return ::std::reference_wrapper(l_value); +} + +namespace internal { + +// A macro from the ACTION* family (defined later in gmock-generated-actions.h) +// defines an action that can be used in a mock function. Typically, +// these actions only care about a subset of the arguments of the mock +// function. For example, if such an action only uses the second +// argument, it can be used in any mock function that takes >= 2 +// arguments where the type of the second argument is compatible. +// +// Therefore, the action implementation must be prepared to take more +// arguments than it needs. 
The ExcessiveArg type is used to +// represent those excessive arguments. In order to keep the compiler +// error messages tractable, we define it in the testing namespace +// instead of testing::internal. However, this is an INTERNAL TYPE +// and subject to change without notice, so a user MUST NOT USE THIS +// TYPE DIRECTLY. +struct ExcessiveArg {}; + +// A helper class needed for implementing the ACTION* macros. +template +class ActionHelper { + public: + template + static Result Perform(Impl* impl, const std::tuple& args) { + return Apply(impl, args, MakeIndexSequence{}, + MakeIndexSequence<10 - sizeof...(Ts)>{}); + } + + private: + template + static Result Apply(Impl* impl, const std::tuple& args, + IndexSequence, IndexSequence) { + return impl->template gmock_PerformImpl( + args, std::get(args)..., + ((void)rest_ids, ExcessiveArg())...); + } +}; + +// A helper base class needed for implementing the ACTION* macros. +// Implements constructor and conversion operator for Action. +// +// Template specialization for parameterless Action. +template +class ActionImpl { + public: + ActionImpl() = default; + + template + operator ::testing::Action() const { // NOLINT(runtime/explicit) + return ::testing::Action(new typename Derived::template gmock_Impl()); + } +}; + +// Template specialization for parameterized Action. +template